diff --git a/.circleci/config.yml b/.circleci/config.yml index c9e7b4a488197..10eae6e9bdf24 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,6 +38,8 @@ jobs: command: | pip --version pip install --progress-bar off -r requirements/doc-requirements.txt pytest pytest-cov plotly .[gateway] + environment: + PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu - run: name: Build documentation working_directory: docs diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index d2d7db5e7eca5..02982c55c7286 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -20,12 +20,17 @@ "prettier.configPath": "/workspaces/mlflow/mlflow/server/js/.prettierrc.js" }, "extensions": [ + "charliermarsh.ruff", "dbaeumer.vscode-eslint", "eamodio.gitlens", "esbenp.prettier-vscode", + "GitHub.copilot", + "GitHub.copilot-chat", "GitHub.vscode-pull-request-github", "ms-azuretools.vscode-docker", + "ms-python.black-formatter", "ms-python.python", + "ms-toolsai.jupyter", "oderwat.indent-rainbow", "PKief.material-icon-theme", "ritwickdey.LiveServer", diff --git a/.devcontainer/pip-compile.sh b/.devcontainer/pip-compile.sh index 698b9fa514d2c..1312b8b0db821 100755 --- a/.devcontainer/pip-compile.sh +++ b/.devcontainer/pip-compile.sh @@ -5,13 +5,15 @@ set -ex pip install pip-tools cd requirements +echo ipykernel >> /tmp/requirements.txt pip-compile --verbose \ --output-file /tmp/output.txt \ skinny-requirements.txt \ core-requirements.txt \ - doc-requirements.txt \ + doc-min-requirements.txt \ test-requirements.txt \ - lint-requirements.txt + lint-requirements.txt \ + /tmp/requirements.txt # Add a timestamp at the beginning of the file echo "# Created at: $(date -u +"%F %T %Z")" | cat - /tmp/output.txt > /tmp/requirements.txt diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 6387ce917dd3f..8db32b7b2b795 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -1,419 +1,632 @@ -# Created at: 2022-11-16 04:25:48 UTC +# Created at: 2023-10-24 08:12:52 UTC # -# This file is autogenerated by pip-compile with python 3.8 -# To update, run: +# This file is autogenerated by pip-compile with Python 3.8 +# by the following command: # -# pip-compile --output-file=/tmp/output.txt core-requirements.txt doc-requirements.txt lint-requirements.txt skinny-requirements.txt test-requirements.txt +# pip-compile --output-file=/tmp/output.txt /tmp/requirements.txt core-requirements.txt doc-min-requirements.txt lint-requirements.txt skinny-requirements.txt test-requirements.txt # -alabaster==0.7.12 +absl-py==2.0.0 + # via rouge-score +aiofiles==23.2.1 + # via mlserver +aiohttp==3.8.6 + # via + # datasets + # fsspec + # tritonclient +aiokafka==0.8.1 + # via mlserver +aiosignal==1.3.1 + # via aiohttp +alabaster==0.7.13 # via sphinx -alembic==1.8.1 - # via -r core-requirements.txt +alembic==1.12.0 + # via + # -r core-requirements.txt + # mlflow +anyio==4.0.0 + # via starlette astroid==2.11.7 # via pylint -asttokens==2.1.0 +asttokens==2.4.0 # via stack-data -attrs==22.1.0 - # via pytest -azure-core==1.26.1 +async-timeout==4.0.3 + # via + # aiohttp + # aiokafka +attrs==23.1.0 + # via aiohttp +azure-core==1.29.5 # via # azure-identity # azure-storage-blob - # msrest -azure-identity==1.12.0 + # azure-storage-file-datalake +azure-identity==1.14.1 # via -r test-requirements.txt -azure-storage-blob==12.14.1 +azure-storage-blob==12.18.3 + # via + # -r test-requirements.txt + # azure-storage-file-datalake 
+azure-storage-file-datalake==12.13.2 # via -r test-requirements.txt -babel==2.11.0 +babel==2.13.0 # via sphinx backcall==0.2.0 # via ipython -black[jupyter]==22.3.0 +black[jupyter]==23.7.0 + # via + # -r lint-requirements.txt + # black + # blacken-docs +blacken-docs==1.16.0 # via -r lint-requirements.txt -boto3==1.26.10 +boto3==1.28.69 # via moto -botocore==1.29.10 +botocore==1.31.69 # via # boto3 # moto # s3transfer -certifi==2022.12.7 +brotli==1.1.0 + # via geventhttpclient +certifi==2023.7.22 # via - # msrest + # geventhttpclient # requests -cffi==1.15.1 +cffi==1.16.0 # via cryptography -cfgv==3.3.1 +cfgv==3.4.0 # via pre-commit -charset-normalizer==2.1.1 - # via requests -click==8.1.3 +charset-normalizer==3.3.1 + # via + # aiohttp + # requests +click==8.1.7 # via # -r skinny-requirements.txt # black # databricks-cli # flask + # mlflow + # mlserver + # nltk # sphinx-click # typer -cloudpickle==2.2.0 + # uvicorn +cloudpickle==2.2.1 # via # -r skinny-requirements.txt # hyperopt + # mlflow # shap colorama==0.4.6 # via # sphinx-autobuild # typer +comm==0.1.4 + # via + # ipykernel + # ipywidgets commonmark==0.9.1 # via rich -contourpy==1.0.6 +contourpy==1.1.1 # via matplotlib -coverage[toml]==6.5.0 - # via pytest-cov -cryptography==38.0.3 +coverage[toml]==7.3.2 + # via + # coverage + # pytest-cov +cryptography==41.0.4 # via # azure-identity # azure-storage-blob # moto # msal # pyjwt -cycler==0.11.0 +cuda-python==12.3.0 + # via tritonclient +cycler==0.12.1 # via matplotlib databricks-cli @ git+https://github.com/databricks/databricks-cli.git # via # -r skinny-requirements.txt # -r test-requirements.txt + # mlflow +databricks-sdk==0.11.0 + # via -r test-requirements.txt +datasets==2.14.6 + # via evaluate +debugpy==1.8.0 + # via ipykernel decorator==5.1.1 # via ipython -dill==0.3.6 - # via pylint -distlib==0.3.6 +dill==0.3.7 + # via + # datasets + # evaluate + # multiprocess + # pylint +distlib==0.3.7 # via virtualenv -docker==6.0.1 - # via -r core-requirements.txt +docker==6.1.3 + # via + # -r core-requirements.txt + # mlflow docutils==0.16 # via # rstcheck-core # sphinx # sphinx-click entrypoints==0.4 - # via -r skinny-requirements.txt -exceptiongroup==1.0.4 - # via pytest -executing==1.2.0 + # via + # -r skinny-requirements.txt + # mlflow +evaluate==0.4.1 + # via -r test-requirements.txt +exceptiongroup==1.1.3 + # via + # anyio + # pytest +executing==2.0.0 # via stack-data -filelock==3.8.0 +fastapi==0.89.1 + # via mlserver +filelock==3.12.4 # via # huggingface-hub - # transformers # virtualenv -flaml==1.0.13 +flaml[automl]==2.1.1 # via -r test-requirements.txt -flask==2.2.2 - # via -r core-requirements.txt -fonttools==4.38.0 +flask==2.2.5 + # via + # -r core-requirements.txt + # -r doc-min-requirements.txt + # mlflow +fonttools==4.43.1 # via matplotlib -future==0.18.2 +frozenlist==1.4.0 + # via + # aiohttp + # aiosignal +fsspec[http]==2023.10.0 + # via + # datasets + # evaluate + # huggingface-hub +future==0.18.3 # via hyperopt -gitdb==4.0.9 +gevent==23.9.1 + # via geventhttpclient +geventhttpclient==2.0.2 + # via tritonclient +gitdb==4.0.11 # via gitpython -gitpython==3.1.32 - # via -r skinny-requirements.txt -greenlet==2.0.1 - # via sqlalchemy -gunicorn==20.1.0 ; platform_system != "Windows" - # via -r core-requirements.txt -huggingface-hub==0.10.1 - # via transformers +gitpython==3.1.40 + # via + # -r skinny-requirements.txt + # mlflow +greenlet==3.0.0 + # via + # gevent + # sqlalchemy +grpcio==1.59.0 + # via + # mlserver + # py-grpc-prometheus +gunicorn==21.2.0 ; platform_system != 
"Windows" + # via + # -r core-requirements.txt + # mlflow +h11==0.14.0 + # via uvicorn +huggingface-hub==0.18.0 + # via + # -r test-requirements.txt + # datasets + # evaluate hyperopt==0.2.7 # via -r test-requirements.txt -identify==2.5.8 +identify==2.5.30 # via pre-commit idna==3.4 - # via requests + # via + # anyio + # requests + # yarl imagesize==1.4.1 # via sphinx -importlib-metadata==5.0.0 +importlib-metadata==6.8.0 # via # -r skinny-requirements.txt # alembic # flask + # jupyter-client # markdown + # mlflow # numba -importlib-resources==5.10.0 - # via alembic -iniconfig==1.1.1 +importlib-resources==6.1.0 + # via + # alembic + # matplotlib + # mlserver +iniconfig==2.0.0 # via pytest -ipython==8.6.0 +ipykernel==6.25.2 + # via -r /tmp/requirements.txt +ipython==8.12.3 # via # -r test-requirements.txt # black + # ipykernel + # ipywidgets +ipywidgets==8.1.1 + # via -r test-requirements.txt isodate==0.6.1 - # via msrest -isort==5.10.1 + # via + # azure-storage-blob + # azure-storage-file-datalake +isort==5.12.0 # via pylint itsdangerous==2.1.2 # via flask -jedi==0.18.1 +jedi==0.19.1 # via ipython jinja2==3.0.3 ; platform_system != "Windows" # via # -r core-requirements.txt - # -r doc-requirements.txt + # -r doc-min-requirements.txt # flask + # mlflow # moto # sphinx jmespath==1.0.1 # via # boto3 # botocore -joblib==1.2.0 - # via scikit-learn +joblib==1.3.2 + # via + # nltk + # scikit-learn +jupyter-client==8.4.0 + # via ipykernel +jupyter-core==5.4.0 + # via + # ipykernel + # jupyter-client +jupyterlab-widgets==3.0.9 + # via ipywidgets +kafka-python==2.0.2 + # via aiokafka kaleido==0.2.1 # via -r test-requirements.txt -kiwisolver==1.4.4 +kiwisolver==1.4.5 # via matplotlib -lazy-object-proxy==1.8.0 +lazy-object-proxy==1.9.0 # via astroid -lightgbm==3.3.3 +lightgbm==4.1.0 # via flaml livereload==2.6.3 # via sphinx-autobuild -llvmlite==0.39.1 +llvmlite==0.41.1 # via numba mako==1.2.4 # via alembic -markdown==3.4.1 - # via -r core-requirements.txt -markupsafe==2.1.1 +markdown==3.5 + # via + # -r core-requirements.txt + # mlflow +markupsafe==2.1.3 # via # jinja2 # mako - # moto # werkzeug -matplotlib==3.6.2 - # via -r core-requirements.txt +matplotlib==3.7.3 + # via + # -r core-requirements.txt + # mlflow matplotlib-inline==0.1.6 - # via ipython + # via + # ipykernel + # ipython mccabe==0.7.0 # via pylint -moto==4.0.9 +mlflow==2.7.1 + # via mlserver-mlflow +mlserver==1.3.5 + # via + # -r test-requirements.txt + # mlserver-mlflow +mlserver-mlflow==1.3.5 # via -r test-requirements.txt -msal==1.20.0 +moto==4.2.4 + # via -r test-requirements.txt +msal==1.24.1 # via # azure-identity # msal-extensions msal-extensions==1.0.0 # via azure-identity -msrest==0.7.1 - # via azure-storage-blob -mypy-extensions==0.4.3 +multidict==6.0.4 + # via + # aiohttp + # yarl +multiprocess==0.70.15 + # via + # datasets + # evaluate +mypy-extensions==1.0.0 # via black -networkx==2.8.8 +nest-asyncio==1.5.8 + # via ipykernel +networkx==3.1 # via hyperopt -nodeenv==1.7.0 +nltk==3.8.1 + # via + # -r test-requirements.txt + # rouge-score +nodeenv==1.8.0 # via pre-commit -numba==0.56.4 +numba==0.58.1 # via shap -numpy==1.23.4 +numpy==1.24.4 # via # -r core-requirements.txt # contourpy + # datasets + # evaluate # flaml # hyperopt # lightgbm # matplotlib + # mlflow + # mlserver # numba # pandas # pyarrow + # rouge-score # scikit-learn # scipy # shap - # transformers + # tritonclient # xgboost oauthlib==3.2.2 - # via - # databricks-cli - # requests-oauthlib -packaging==21.3 + # via databricks-cli +orjson==3.9.9 + # via mlserver 
+packaging==23.2 # via # -r skinny-requirements.txt + # aiokafka + # black + # datasets # docker + # evaluate + # gunicorn # huggingface-hub + # ipykernel # matplotlib + # mlflow + # plotly # pytest # shap # sphinx - # transformers -pandas==1.5.1 +pandas==2.0.3 # via # -r core-requirements.txt + # datasets + # evaluate # flaml + # mlflow + # mlserver # shap parso==0.8.3 # via jedi -pathspec==0.10.2 +pathspec==0.11.2 # via black pexpect==4.8.0 # via ipython pickleshare==0.7.5 # via ipython -pillow==9.3.0 +pillow==10.1.0 # via # -r test-requirements.txt # matplotlib -platformdirs==2.5.4 +platformdirs==3.11.0 # via # black + # jupyter-core # pylint # virtualenv -plotly==5.11.0 +plotly==5.17.0 # via -r test-requirements.txt -pluggy==1.0.0 +pluggy==1.3.0 # via pytest -portalocker==2.6.0 +portalocker==2.8.2 # via msal-extensions pre-commit==2.20.0 # via -r lint-requirements.txt -prompt-toolkit==3.0.32 +prometheus-client==0.17.1 + # via + # py-grpc-prometheus + # starlette-exporter +prompt-toolkit==3.0.39 # via ipython -protobuf==4.21.9 - # via -r skinny-requirements.txt +protobuf==4.24.4 + # via + # -r skinny-requirements.txt + # mlflow + # mlserver +psutil==5.9.6 + # via + # -r core-requirements.txt + # ipykernel ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 # via stack-data +py-grpc-prometheus==0.7.0 + # via mlserver py4j==0.10.9.7 # via hyperopt -pyarrow==10.0.0 - # via -r core-requirements.txt +pyarrow==13.0.0 + # via + # -r core-requirements.txt + # datasets + # mlflow pycparser==2.21 # via cffi -pydantic==1.10.2 - # via rstcheck-core -pygments==2.15.0 +pydantic==1.10.13 + # via + # fastapi + # rstcheck-core +pygments==2.16.1 # via # ipython # rich # sphinx -pyjwt[crypto]==2.6.0 +pyjwt[crypto]==2.8.0 # via # databricks-cli # msal pylint==2.14.4 # via -r lint-requirements.txt -pyparsing==3.0.9 - # via - # matplotlib - # packaging -pytest==7.2.0 +pyparsing==3.1.1 + # via matplotlib +pyphen==0.14.0 + # via textstat +pytest==7.4.2 # via # -r test-requirements.txt # pytest-cov -pytest-cov==4.0.0 + # pytest-timeout +pytest-cov==4.1.0 # via -r test-requirements.txt pytest-localserver==0.5.0 # via -r test-requirements.txt +pytest-timeout==2.2.0 + # via -r test-requirements.txt python-dateutil==2.8.2 # via # botocore + # jupyter-client # matplotlib # moto # pandas -pytz==2022.6 +python-dotenv==1.0.0 + # via mlserver +python-rapidjson==1.12 + # via tritonclient +pytz==2023.3.post1 # via # -r skinny-requirements.txt # babel - # moto + # mlflow # pandas -pyyaml==6.0 +pyyaml==6.0.1 # via # -r skinny-requirements.txt + # datasets # huggingface-hub + # mlflow # pre-commit - # transformers +pyzmq==25.1.1 + # via + # ipykernel + # jupyter-client querystring-parser==1.2.4 - # via -r core-requirements.txt -regex==2022.10.31 - # via transformers -requests==2.28.1 + # via + # -r core-requirements.txt + # mlflow +regex==2023.10.3 + # via + # nltk + # tiktoken +requests==2.31.0 # via # -r skinny-requirements.txt # azure-core # databricks-cli + # databricks-sdk + # datasets # docker + # evaluate + # fsspec # huggingface-hub + # mlflow # moto # msal - # msrest - # requests-oauthlib # responses # sphinx - # transformers -requests-oauthlib==1.3.1 - # via msrest -responses==0.22.0 - # via moto + # tiktoken +responses==0.18.0 + # via + # evaluate + # moto rich==12.6.0 # via typer +rouge-score==0.1.2 + # via -r test-requirements.txt rstcheck==6.1.1 # via -r lint-requirements.txt rstcheck-core==1.0.3 # via rstcheck -s3transfer==0.6.0 +ruff==0.0.292 + # via -r lint-requirements.txt +s3transfer==0.7.0 # via boto3 
-scikit-learn==1.1.3 +scikit-learn==1.3.2 # via # -r core-requirements.txt # flaml - # lightgbm + # mlflow # shap -scipy==1.9.3 +scipy==1.10.1 # via # -r core-requirements.txt # flaml # hyperopt # lightgbm + # mlflow # scikit-learn # shap # xgboost -shap==0.41.0 - # via - # -r core-requirements.txt - # -r test-requirements.txt -shellingham==1.5.0 +shap==0.43.0 + # via -r test-requirements.txt +shellingham==1.5.4 # via typer six==1.16.0 # via # asttokens # azure-core - # azure-identity # databricks-cli + # geventhttpclient # hyperopt # isodate # livereload # python-dateutil # querystring-parser + # rouge-score slicer==0.0.7 # via shap -smmap==5.0.0 +smmap==5.0.1 # via gitdb +sniffio==1.3.0 + # via anyio snowballstemmer==2.2.0 # via sphinx sphinx==3.5.4 # via - # -r doc-requirements.txt + # -r doc-min-requirements.txt # sphinx-autobuild # sphinx-click sphinx-autobuild==2021.3.14 - # via -r doc-requirements.txt -sphinx-click==4.3.0 - # via -r doc-requirements.txt -sphinxcontrib-applehelp==1.0.2 + # via -r doc-min-requirements.txt +sphinx-click==5.0.1 + # via -r doc-min-requirements.txt +sphinxcontrib-applehelp==1.0.4 # via sphinx sphinxcontrib-devhelp==1.0.2 # via sphinx -sphinxcontrib-htmlhelp==2.0.0 +sphinxcontrib-htmlhelp==2.0.1 # via sphinx sphinxcontrib-jsmath==1.0.1 # via sphinx @@ -421,94 +634,140 @@ sphinxcontrib-qthelp==1.0.3 # via sphinx sphinxcontrib-serializinghtml==1.1.5 # via sphinx -sqlalchemy==1.4.44 +sqlalchemy==2.0.22 # via # -r core-requirements.txt # alembic -sqlparse==0.4.3 - # via -r skinny-requirements.txt -stack-data==0.6.1 + # mlflow +sqlparse==0.4.4 + # via + # -r skinny-requirements.txt + # mlflow +stack-data==0.6.3 # via ipython +starlette==0.22.0 + # via + # fastapi + # starlette-exporter +starlette-exporter==0.16.0 + # via mlserver tabulate==0.9.0 # via databricks-cli -tenacity==8.1.0 +tenacity==8.2.3 # via plotly -threadpoolctl==3.1.0 +textstat==0.7.3 + # via -r test-requirements.txt +threadpoolctl==3.2.0 # via scikit-learn -tokenize-rt==5.0.0 +tiktoken==0.5.1 + # via -r test-requirements.txt +tokenize-rt==5.2.0 # via black -tokenizers==0.13.2 - # via transformers toml==0.10.2 - # via - # pre-commit - # responses + # via pre-commit tomli==2.0.1 # via # black # coverage # pylint # pytest -tomlkit==0.11.6 +tomlkit==0.12.1 # via pylint tornado==6.3.3 - # via livereload -tqdm==4.64.1 # via + # ipykernel + # jupyter-client + # livereload +tqdm==4.66.1 + # via + # -r test-requirements.txt + # datasets + # evaluate # huggingface-hub # hyperopt + # nltk # shap - # transformers -traitlets==5.5.0 +traitlets==5.11.2 # via + # comm + # ipykernel # ipython + # ipywidgets + # jupyter-client + # jupyter-core # matplotlib-inline -transformers==4.24.0 - # via -r test-requirements.txt +tritonclient[http]==2.38.0 + # via + # mlserver + # tritonclient typer[all]==0.7.0 - # via rstcheck -types-docutils==0.19.1.1 + # via + # rstcheck + # typer +types-docutils==0.19.1.9 # via rstcheck-core -types-toml==0.10.8.1 - # via responses -typing-extensions==4.4.0 +typing-extensions==4.8.0 # via + # alembic # astroid # azure-core + # azure-storage-blob + # azure-storage-file-datalake # black # huggingface-hub + # ipython # pydantic # pylint # rich -urllib3==1.26.12 + # sqlalchemy + # starlette + # uvicorn +tzdata==2023.3 + # via pandas +urllib3==1.26.18 # via # botocore + # databricks-cli # docker # requests # responses -virtualenv==20.16.7 +uvicorn==0.23.2 + # via mlserver +uvloop==0.19.0 + # via mlserver +virtualenv==20.24.6 # via pre-commit -wcwidth==0.2.5 +wcwidth==0.2.8 # via 
prompt-toolkit -websocket-client==1.4.2 +websocket-client==1.6.4 # via docker -werkzeug==2.2.2 +werkzeug==3.0.0 # via # flask # moto # pytest-localserver -wheel==0.38.4 - # via lightgbm -wrapt==1.14.1 +widgetsnbextension==4.0.9 + # via ipywidgets +wrapt==1.15.0 # via astroid -xgboost==1.7.1 +xgboost==2.0.0 # via flaml xmltodict==0.13.0 # via moto -zipp==3.10.0 +xxhash==3.4.1 + # via + # datasets + # evaluate +yarl==1.9.2 + # via aiohttp +zipp==3.17.0 # via # importlib-metadata # importlib-resources +zope-event==5.0 + # via gevent +zope-interface==6.1 + # via gevent # The following packages are considered to be unsafe in a requirements file: # pip diff --git a/.github/ISSUE_TEMPLATE/bug_report_template.yaml b/.github/ISSUE_TEMPLATE/bug_report_template.yaml index 4b0c0482eb260..ac1045ce4847e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report_template.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report_template.yaml @@ -17,6 +17,16 @@ body: options: - label: I have read and agree to submit bug reports in accordance with the [issues policy](https://www.github.com/mlflow/mlflow/blob/master/ISSUE_POLICY.md) required: true + - type: dropdown + attributes: + label: Where did you encounter this bug? + options: + - Local machine + - Databricks + - Azure Machine Learning + - Other + validations: + required: true - type: dropdown id: contribution attributes: diff --git a/.github/actions/cache-pip/action.yml b/.github/actions/cache-pip/action.yml index 64dd364b47d3c..945550fb77db5 100644 --- a/.github/actions/cache-pip/action.yml +++ b/.github/actions/cache-pip/action.yml @@ -11,5 +11,5 @@ runs: env: SEGMENT_DOWNLOAD_TIMEOUT_MINS: 1 with: - path: ~/.venv + path: .venv key: ${{ steps.py-cache-key.outputs.key }} diff --git a/.github/actions/py-cache-key/action.yml b/.github/actions/py-cache-key/action.yml index 45dda7405d01b..659731dd3564f 100644 --- a/.github/actions/py-cache-key/action.yml +++ b/.github/actions/py-cache-key/action.yml @@ -22,5 +22,5 @@ runs: # Refresh cache daily DATE=$(date -u "+%Y%m%d") # Change this value to force a cache refresh - N=0 + N=1 echo "value=$RUNNER_IMAGE-$PYTHON_VERSION-$DATE-$REQUIREMENTS_HASH-$N" >> $GITHUB_OUTPUT diff --git a/.github/workflows/advice.js b/.github/workflows/advice.js index 1d418bed1482d..9886d5dfa6030 100644 --- a/.github/workflows/advice.js +++ b/.github/workflows/advice.js @@ -29,13 +29,35 @@ module.exports = async ({ context, github }) => { const { user, body } = context.payload.pull_request; const messages = []; - const codespacesBadge = `[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/${user.login}/mlflow/pull/${issue_number}?quickstart=1)`; - if (body && !body.includes(codespacesBadge)) { + const title = "🛠 DevTools 🛠"; + if (body && !body.includes(title)) { + const codespacesBadge = `[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://codespaces.new/${user.login}/mlflow/pull/${issue_number}?quickstart=1)`; + const newSection = ` +
<details><summary>${title}</summary>

+ +${codespacesBadge} + +#### Install mlflow from this PR + +\`\`\` +pip install git+https://github.com/mlflow/mlflow.git@refs/pull/${issue_number}/merge +\`\`\` + +#### Checkout with GitHub CLI + +\`\`\` +gh pr checkout ${issue_number} +\`\`\` + +

+</details>
+`.trim(); await github.rest.pulls.update({ owner, repo, pull_number: issue_number, - body: `${codespacesBadge}\n\n${body}`, + body: `${newSection}\n\n${body}`, }); } diff --git a/.github/workflows/cross-version-tests.yml b/.github/workflows/cross-version-tests.yml index 860df0a01e017..69e17b4b68e60 100644 --- a/.github/workflows/cross-version-tests.yml +++ b/.github/workflows/cross-version-tests.yml @@ -41,6 +41,7 @@ defaults: shell: bash --noprofile --norc -exo pipefail {0} env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow PIP_EXTRA_INDEX_URL: https://download.pytorch.org/whl/cpu jobs: @@ -197,5 +198,4 @@ jobs: PACKAGE_VERSION: ${{ matrix.version }} JOHNSNOWLABS_LICENSE_JSON: ${{ secrets.JOHNSNOWLABS_LICENSE_JSON }} run: | - export MLFLOW_HOME=$(pwd) ${{ matrix.run }} diff --git a/.github/workflows/examples.yml b/.github/workflows/examples.yml index abf92c5343cd4..f3f7926abe1d1 100644 --- a/.github/workflows/examples.yml +++ b/.github/workflows/examples.yml @@ -33,6 +33,7 @@ defaults: shell: bash --noprofile --norc -exo pipefail {0} env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow MLFLOW_CONDA_HOME: /usr/share/miniconda jobs: @@ -74,7 +75,6 @@ jobs: env: SPARK_LOCAL_IP: localhost run: | - export MLFLOW_HOME=$(pwd) pytest tests/examples --durations=30 - name: Remove conda environments diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 771802be7039d..a38597b7000c4 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -21,6 +21,7 @@ defaults: shell: bash --noprofile --norc -exo pipefail {0} env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow # Note miniconda is pre-installed in the virtual environments for GitHub Actions: # https://github.com/actions/virtual-environments/blob/main/images/linux/scripts/installers/miniconda.sh MLFLOW_CONDA_HOME: /usr/share/miniconda @@ -31,7 +32,7 @@ jobs: lint: runs-on: ubuntu-latest timeout-minutes: 30 - if: github.event_name == 'pull_request' && github.event.pull_request.draft == false + if: github.event_name != 'pull_request' && github.event.pull_request.draft == false steps: - uses: actions/checkout@v3 - uses: ./.github/actions/untracked @@ -45,21 +46,21 @@ jobs: - uses: ./.github/actions/cache-pip - name: Install dependencies run: | - python -m venv ~/.venv - . ~/.venv/bin/activate + python -m venv .venv + source .venv/bin/activate source ./dev/install-common-deps.sh --ml pip install -r requirements/lint-requirements.txt - uses: ./.github/actions/pipdeptree - name: Install pre-commit hooks run: | - . ~/.venv/bin/activate + source .venv/bin/activate pre-commit install -t pre-commit -t prepare-commit-msg - name: Run pre-commit id: pre-commit env: IS_MAINTAINER: ${{ contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association )}} run: | - . ~/.venv/bin/activate + source .venv/bin/activate pre-commit run --all-files # python-skinny tests cover a subset of mlflow functionality @@ -87,6 +88,12 @@ jobs: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false runs-on: ubuntu-latest timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + group: [1, 2] + include: + - splits: 2 steps: - uses: actions/checkout@v3 with: @@ -99,21 +106,21 @@ jobs: - uses: ./.github/actions/cache-pip - name: Install dependencies run: | - python -m venv ~/.venv - . 
~/.venv/bin/activate + python -m venv .venv + source .venv/bin/activate source ./dev/install-common-deps.sh --ml - # pyspark 3.5 is incompatible with delta 2.4 - pip install 'pyspark<3.5' - uses: ./.github/actions/pipdeptree - name: Import check run: | - . ~/.venv/bin/activate + source .venv/bin/activate python tests/check_mlflow_lazily_imports_ml_packages.py - name: Run tests run: | - . ~/.venv/bin/activate + source .venv/bin/activate source dev/setup-ssh.sh - ./dev/run-python-tests.sh + pytest --splits=${{ matrix.splits }} --group=${{ matrix.group }} --quiet --requires-ssh \ + --ignore-flavors --ignore=tests/examples --ignore=tests/recipes --ignore=tests/evaluate \ + tests database: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false @@ -160,27 +167,6 @@ jobs: test $err = 0 - - name: Rebuild images with SQLAlchemy < 2.0 - run: | - sed -i 's/sqlalchemy.*/sqlalchemy<2.0/g' requirements/core-requirements.txt - git diff - ./tests/db/compose.sh build --build-arg DEPENDENCIES="$(python setup.py -q dependencies)" - - name: Run tests - run: | - set +e - err=0 - trap 'err=1' ERR - - for service in $(./tests/db/compose.sh config --services | grep '^mlflow-') - do - # Set `--no-TTY` to show container logs on GitHub Actions: - ./tests/db/compose.sh run --rm --no-TTY $service pytest \ - tests/store/tracking/test_sqlalchemy_store.py \ - tests/store/model_registry/test_sqlalchemy_store.py \ - tests/db - done - - test $err = 0 - name: Clean up run: | ./tests/db/compose.sh down --volumes --remove-orphans --rmi all @@ -220,14 +206,18 @@ jobs: - uses: ./.github/actions/cache-pip - name: Install dependencies run: | - python -m venv ~/.venv - . ~/.venv/bin/activate + python -m venv .venv + source .venv/bin/activate source ./dev/install-common-deps.sh --ml - uses: ./.github/actions/pipdeptree - name: Run tests run: | - . ~/.venv/bin/activate - ./dev/run-python-flavor-tests.sh; + source .venv/bin/activate + pytest \ + tests/utils/test_model_utils.py \ + tests/tracking/fluent/test_fluent_autolog.py \ + tests/autologging \ + tests/server/auth # It takes 9 ~ 10 minutes to run tests in `tests/models`. To make CI finish faster, # run these tests in a separate job. @@ -235,6 +225,12 @@ jobs: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false runs-on: ubuntu-latest timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + group: [1, 2] + include: + - splits: 2 steps: - uses: actions/checkout@v3 with: @@ -251,8 +247,7 @@ jobs: - uses: ./.github/actions/pipdeptree - name: Run tests run: | - export MLFLOW_HOME=$(pwd) - pytest tests/models + pytest --splits=${{ matrix.splits }} --group=${{ matrix.group }} tests/models # NOTE: numpy is pinned in this suite due to its heavy reliance on shap, which internally uses # references to the now fully deprecated (as of 1.24.x) numpy types (i.e., np.bool). 
@@ -277,13 +272,18 @@ jobs: - uses: ./.github/actions/pipdeptree - name: Run tests run: | - export MLFLOW_HOME=$(pwd) - pytest tests/evaluate + dev/pytest.sh tests/evaluate pyfunc: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false runs-on: ubuntu-latest timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + group: [1, 2] + include: + - splits: 2 steps: - uses: actions/checkout@v3 with: @@ -300,8 +300,8 @@ jobs: - uses: ./.github/actions/pipdeptree - name: Run tests run: | - export MLFLOW_HOME=$(pwd) - pytest --durations=30 tests/pyfunc --ignore tests/pyfunc/test_spark_connect.py + pytest --splits=${{ matrix.splits }} --group=${{ matrix.group }} --durations=30 \ + tests/pyfunc --ignore tests/pyfunc/test_spark_connect.py # test_spark_connect.py fails if it's run with ohter tests, so run it separately. pytest tests/pyfunc/test_spark_connect.py @@ -323,12 +323,18 @@ jobs: - uses: ./.github/actions/pipdeptree - name: Run tests run: | - ./dev/run-python-sagemaker-tests.sh; + pytest tests/sagemaker windows: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false runs-on: windows-latest timeout-minutes: 120 + strategy: + fail-fast: false + matrix: + group: [1, 2] + include: + - splits: 2 steps: - uses: actions/checkout@v3 with: @@ -337,13 +343,15 @@ jobs: - uses: ./.github/actions/setup-python - uses: ./.github/actions/setup-pyenv - uses: ./.github/actions/setup-java + - uses: ./.github/actions/cache-pip - name: Install python dependencies run: | + python -m venv .venv + source .venv/Scripts/activate pip install -r requirements/test-requirements.txt pip install --no-dependencies tests/resources/mlflow-test-plugin pip install -e .[extras] - # pyspark 3.5 is incompatible with delta 2.4 - pip install 'pyspark<3.5' + pip install pyspark pip install mleap # Install Hugging Face datasets to test Hugging Face usage with MLflow dataset tracking pip install datasets @@ -363,11 +371,14 @@ jobs: # it's explicitly disposed. MLFLOW_SQLALCHEMYSTORE_POOLCLASS: "NullPool" run: | + source .venv/Scripts/activate # Set Hadoop environment variables required for testing Spark integrations on Windows export HADOOP_HOME=/tmp/winutils/hadoop-3.2.2 export PATH=$PATH:$HADOOP_HOME/bin # Run Windows tests - pytest --ignore-flavors --ignore=tests/projects --ignore=tests/examples tests --ignore=tests/recipes --ignore=tests/evaluate + pytest --splits=${{ matrix.splits }} --group=${{ matrix.group }} \ + --ignore-flavors --ignore=tests/projects --ignore=tests/examples --ignore=tests/recipes --ignore=tests/evaluate \ + tests # MLeap is incompatible on Windows with PySpark3.4 release. # Reinstate tests when MLeap has released a fix. 
[ML-30491] # pytest tests/mleap diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml index db9bc24e62397..0e629a0773cbb 100644 --- a/.github/workflows/r.yml +++ b/.github/workflows/r.yml @@ -18,6 +18,7 @@ concurrency: cancel-in-progress: true env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION: python jobs: @@ -90,5 +91,4 @@ jobs: LINTR_COMMENT_BOT: false run: | cd tests - export MLFLOW_HOME=$(git rev-parse --show-toplevel) Rscript -e 'source("../.run-tests.R", echo=TRUE)' diff --git a/.github/workflows/recipe-template.yml b/.github/workflows/recipe-template.yml index 6bb23e4624cfc..5f16b6a456e1e 100644 --- a/.github/workflows/recipe-template.yml +++ b/.github/workflows/recipe-template.yml @@ -55,9 +55,15 @@ jobs: - name: Install dependencies run: | pip install -r ${{ github.event.inputs.repository }}/requirements/lint-requirements.txt - - name: Run lint checks + - name: Install pre-commit hooks run: | - cd ${{ github.event.inputs.repository }} && ../../dev/lint.sh + cd ${{ github.event.inputs.repository }} + pre-commit install -t pre-commit -t prepare-commit-msg + - name: Run pre-commit + id: pre-commit + run: | + cd ${{ github.event.inputs.repository }} + pre-commit run --all-files recipe: runs-on: ubuntu-latest timeout-minutes: 120 diff --git a/.github/workflows/recipe.yml b/.github/workflows/recipe.yml index 865c1a945e33c..a3565e42e72f3 100644 --- a/.github/workflows/recipe.yml +++ b/.github/workflows/recipe.yml @@ -21,6 +21,7 @@ defaults: shell: bash --noprofile --norc -exo pipefail {0} env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow # Note miniconda is pre-installed in the virtual environments for GitHub Actions: # https://github.com/actions/virtual-environments/blob/main/images/linux/scripts/installers/miniconda.sh MLFLOW_CONDA_HOME: /usr/share/miniconda @@ -41,17 +42,20 @@ jobs: - name: Install dependencies run: | source ./dev/install-common-deps.sh - pip install -e . - # pyspark 3.5 is incompatible with delta 2.4 - pip install 'pyspark<3.5' + pip install pyspark - name: Run tests run: | - export MLFLOW_HOME=$(pwd) pytest tests/recipes recipes-windows: if: github.event_name != 'pull_request' || github.event.pull_request.draft == false runs-on: windows-latest + strategy: + fail-fast: false + matrix: + group: [1, 2] + include: + - splits: 2 steps: - uses: actions/checkout@v3 with: @@ -64,8 +68,7 @@ jobs: pip install -r requirements/test-requirements.txt pip install --no-dependencies tests/resources/mlflow-test-plugin pip install -e . - # pyspark 3.5 is incompatible with delta 2.4 - pip install 'pyspark<3.5' + pip install pyspark # TODO: Importing datasets in a pandas UDF (created by mlflow.pyfunc.spark_udf) crashes # the Python worker. To avoid this, uninstall `datasets`. This is a temporary workaround. 
pip uninstall -y datasets @@ -81,6 +84,5 @@ jobs: # Set Hadoop environment variables required for testing Spark integrations on Windows export HADOOP_HOME=/tmp/winutils/hadoop-3.2.2 export PATH=$PATH:$HADOOP_HOME/bin - # Run recipes tests export MLFLOW_HOME=$(pwd) - pytest tests/recipes + pytest --splits ${{ matrix.splits }} --group ${{ matrix.group }} tests/recipes diff --git a/.github/workflows/requirements.yml b/.github/workflows/requirements.yml index 679145e25a0c3..cc990b41b1ffa 100644 --- a/.github/workflows/requirements.yml +++ b/.github/workflows/requirements.yml @@ -18,6 +18,7 @@ concurrency: cancel-in-progress: true env: + MLFLOW_HOME: /home/runner/work/mlflow/mlflow MLFLOW_CONDA_HOME: /usr/share/miniconda SPARK_LOCAL_IP: localhost PYTHON_VERSION: "3.8" @@ -79,4 +80,5 @@ jobs: - name: Run tests run: | source dev/setup-ssh.sh - ./dev/run-python-tests.sh + pytest tests --quiet --requires-ssh --ignore-flavors \ + --ignore=tests/examples --ignore=tests/recipes --ignore=tests/evaluate diff --git a/.gitignore b/.gitignore index f448ebaf4f360..633f40b2f37ad 100644 --- a/.gitignore +++ b/.gitignore @@ -104,6 +104,7 @@ travis_wait*.log lightning_logs a.py +a.ipynb a.md # Log file created by pre-commit hook for black diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 95ebe516c1669..ba366f2648353 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,6 +11,7 @@ We welcome community contributions to MLflow. This page provides useful informat - [Write designs for significant changes](#write-designs-for-significant-changes) - [Make changes backwards compatible](#make-changes-backwards-compatible) - [Consider introducing new features as MLflow Plugins](#consider-introducing-new-features-as-mlflow-plugins) + - [Python Style Guide](#python-style-guide) - [Setting up the repository](#setting-up-the-repository) - [Developing and testing MLflow](#developing-and-testing-mlflow) - [Environment Setup and Python configuration](#environment-setup-and-python-configuration) @@ -180,6 +181,35 @@ base](https://github.com/mlflow/mlflow/blob/cdc6a651d5af0f29bd448d2c87a198cf5d32 For more information about Plugins, see . +### Python Style Guide + +##### Docstrings + +We follow [Google's Python Style Guide](https://google.github.io/styleguide/pyguide.html) +for writing docstrings. Make sure your docstrings adhere to this style +guide. + +The process for converting to a standard docstring format style is +ongoing. If you see a docstring in the code base that doesn't adhere +to this formatting style and you'd like to contribute a fix, feel free +to open a PR to correct the docstring formatting. + +###### Code Style + +We use [pylint](https://pypi.org/project/pylint/), +[black](https://black.readthedocs.io/en/stable/the_black_code_style/index.html), +and [ruff](https://github.com/astral-sh/ruff) in our CI via +pre-commit Git hooks. If your code passes the CI checks, it's +formatted correctly. + +To validate that your local versions of the above libraries +match those in the mlflow CI, refer to [lint-requirements.txt](https://github.com/mlflow/mlflow/blob/master/requirements/lint-requirements.txt). 
+You can compare these versions with your local using pip: + +```bash +pip show pylint +``` + ## Setting up the repository To set up the MLflow repository, run the following commands: @@ -580,8 +610,9 @@ Then, verify that the unit tests & linter pass before submitting a pull request by running: ```bash -./dev/lint.sh -./dev/run-python-tests.sh +pre-commit run --all-files +pytest tests --quiet --requires-ssh --ignore-flavors \ + --ignore=tests/examples --ignore=tests/recipes --ignore=tests/evaluate ``` We use [pytest](https://docs.pytest.org/en/latest/contents.html) to run @@ -612,7 +643,7 @@ If you are adding new framework flavor support, you'll need to modify `pytest` and Github action configurations so tests for your code can run properly. Generally, the files you'll have to edit are: -1. `dev/run-python-tests.sh`: +1. `.github/workflows/master.yml`: lines where pytest runs with `--ignore-flavors` flag 1. Add your tests to the ignore list, where the other frameworks are ignored diff --git a/conftest.py b/conftest.py index 61ae4b720edd3..a006a1c80f298 100644 --- a/conftest.py +++ b/conftest.py @@ -27,6 +27,18 @@ def pytest_addoption(parser): default=False, help="Ignore tests for model flavors.", ) + parser.addoption( + "--splits", + default=None, + type=int, + help="The number of groups to split tests into.", + ) + parser.addoption( + "--group", + default=None, + type=int, + help="The group of tests to run.", + ) def pytest_configure(config): @@ -36,6 +48,29 @@ def pytest_configure(config): config.addinivalue_line("markers", "allow_infer_pip_requirements_fallback") +@pytest.hookimpl(tryfirst=True) +def pytest_cmdline_main(config): + group = config.getoption("group") + splits = config.getoption("splits") + + if splits is None and group is None: + return None + + if splits and group is None: + raise pytest.UsageError("`--group` is required") + + if group and splits is None: + raise pytest.UsageError("`--splits` is required") + + if splits < 0: + raise pytest.UsageError("`--splits` must be >= 1") + + if group < 1 or group > splits: + raise pytest.UsageError("`--group` must be between 1 and {splits}") + + return None + + def pytest_sessionstart(session): if uri := MLFLOW_TRACKING_URI.get(): click.echo( @@ -55,6 +90,35 @@ def pytest_runtest_setup(item): pytest.skip("use `--requires-ssh` to run this test") +@pytest.hookimpl(hookwrapper=True) +def pytest_report_teststatus(report, config): + outcome = yield + if report.when == "call": + try: + import psutil + except ImportError: + return + + (*rest, result) = outcome.get_result() + mem = psutil.virtual_memory() + mem_used = mem.used / 1024**3 + mem_total = mem.total / 1024**3 + + disk = psutil.disk_usage("/") + disk_used = disk.used / 1024**3 + disk_total = disk.total / 1024**3 + outcome.force_result( + ( + *rest, + ( + f"{result} | " + f"MEM {mem_used:.1f}/{mem_total:.1f} GB | " + f"DISK {disk_used:.1f}/{disk_total:.1f} GB" + ), + ) + ) + + @pytest.hookimpl(hookwrapper=True) def pytest_ignore_collect(path, config): outcome = yield @@ -118,6 +182,7 @@ def pytest_ignore_collect(path, config): outcome.force_result(True) +@pytest.hookimpl(trylast=True) def pytest_collection_modifyitems(session, config, items): # pylint: disable=unused-argument # Executing `tests.server.test_prometheus_exporter` after `tests.server.test_handlers` # results in an error because Flask >= 2.2.0 doesn't allow calling setup method such as @@ -125,6 +190,10 @@ def pytest_collection_modifyitems(session, config, items): # pylint: disable=un # execute 
`tests.server.test_prometheus_exporter` first by reordering the test items. items.sort(key=lambda item: item.module.__name__ != "tests.server.test_prometheus_exporter") + # Select the tests to run based on the group and splits + if (splits := config.getoption("--splits")) and (group := config.getoption("--group")): + items[:] = items[(group - 1) :: splits] + @pytest.hookimpl(hookwrapper=True) def pytest_terminal_summary( diff --git a/dev/dev-env-setup.sh b/dev/dev-env-setup.sh index 6efd841e58537..fbb2616934faf 100755 --- a/dev/dev-env-setup.sh +++ b/dev/dev-env-setup.sh @@ -21,7 +21,7 @@ This script will: Example usage: - From root of MLflow repository on local with a destination virtualenv path of /.venvs/mlflow-dev: + From root of MLflow repository on local with a destination virtualenv path of /.venvs/mlflow-dev: dev/dev-env-setup.sh -d $(pwd)/.venvs/mlflow-dev @@ -151,8 +151,8 @@ if [ -z "$pyenv_exist" ]; then fi fi -MLFLOW_HOME=$(pwd) -rd="$MLFLOW_HOME/requirements" +REPO_ROOT=$(git rev-parse --show-toplevel) +rd="$REPO_ROOT/requirements" # Get the minimum supported version for development purposes min_py_version="3.8" diff --git a/dev/get_minimum_required_python.py b/dev/get_minimum_required_python.py index 0530e68594a8e..db022ef83d763 100644 --- a/dev/get_minimum_required_python.py +++ b/dev/get_minimum_required_python.py @@ -5,14 +5,14 @@ python dev/get_minimum_required_python.py -p scikit-learn -v 1.1.0 --python-versions "3.8" """ import argparse -import typing as t +from typing import Optional import requests from packaging.specifiers import SpecifierSet from packaging.version import Version -def get_requires_python(package: str, version: str) -> t.Optional[str]: +def get_requires_python(package: str, version: str) -> Optional[str]: resp = requests.get(f"https://pypi.python.org/pypi/{package}/json") resp.raise_for_status() return next( diff --git a/dev/install-common-deps.sh b/dev/install-common-deps.sh index 2134f8dad20ac..052deb10a16af 100755 --- a/dev/install-common-deps.sh +++ b/dev/install-common-deps.sh @@ -52,7 +52,6 @@ if [[ "$SKINNY" == "true" ]]; then else pip install .[extras] --upgrade fi -export MLFLOW_HOME=$(pwd) req_files="" # Install Python test dependencies only if we're running Python tests @@ -77,7 +76,6 @@ pip install --no-dependencies tests/resources/mlflow-test-plugin pip install aiohttp python dev/show_package_release_dates.py which mlflow -echo $MLFLOW_HOME # Print mlflow version mlflow --version diff --git a/dev/pytest.sh b/dev/pytest.sh new file mode 100755 index 0000000000000..a6d8a61b65c41 --- /dev/null +++ b/dev/pytest.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# pytest runner to fail fast if the "fail-fast" label is present on the PR. 
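+#
+# When invoked from a GitHub Actions pull_request run, the PR's labels are fetched from the
+# GitHub API; if the "fail-fast" label is present, pytest is run with --exitfirst so the suite
+# stops at the first failure. Outside of GitHub Actions the label lookup is skipped and the
+# arguments are passed to pytest unchanged.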
+ +fetch_labels() { + if [ -z $GITHUB_ACTIONS ]; then + echo "" + return + fi + + if [ "$GITHUB_EVENT_NAME" != "pull_request" ]; then + echo "" + return + fi + + PR_DATA=$(cat $GITHUB_EVENT_PATH) + PR_NUMBER=$(echo $PR_DATA | jq --raw-output .pull_request.number) + LABELS=$(curl -s https://api.github.com/repos/$GITHUB_REPOSITORY/issues/$PR_NUMBER/labels | jq --raw-output .[].name) + echo $LABELS +} + +main() { + LABELS=$(fetch_labels) + + if [[ $LABELS == *"fail-fast"* ]]; then + EXTRA_OPTIONS="--exitfirst" + fi + + echo "pytest $EXTRA_OPTIONS ${@:1}" + pytest $EXTRA_OPTIONS "${@:1}" +} + +main "$@" diff --git a/dev/run-python-flavor-tests.sh b/dev/run-python-flavor-tests.sh deleted file mode 100755 index b21e3e3a342f3..0000000000000 --- a/dev/run-python-flavor-tests.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -set -x - -export MLFLOW_HOME=$(pwd) - -pytest \ - tests/utils/test_model_utils.py \ - tests/tracking/fluent/test_fluent_autolog.py \ - tests/autologging \ - tests/server/auth diff --git a/dev/run-python-sagemaker-tests.sh b/dev/run-python-sagemaker-tests.sh deleted file mode 100755 index f7b0cc5c3eeda..0000000000000 --- a/dev/run-python-sagemaker-tests.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash -set -ex - -export MLFLOW_HOME=$(pwd) - -pytest tests/sagemaker diff --git a/dev/run-python-tests.sh b/dev/run-python-tests.sh deleted file mode 100755 index 807ee7c777d42..0000000000000 --- a/dev/run-python-tests.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash -set -x -# Set err=1 if any commands exit with non-zero status as described in -# https://stackoverflow.com/a/42219754 -err=0 -trap 'err=1' ERR -export MLFLOW_HOME=$(pwd) - -pytest tests --quiet --requires-ssh --ignore-flavors --ignore=tests/examples --ignore=tests/recipes --ignore=tests/evaluate - -test $err = 0 diff --git a/dev/set_matrix.py b/dev/set_matrix.py index 4cc2a604eeebd..837e42ab43fc9 100644 --- a/dev/set_matrix.py +++ b/dev/set_matrix.py @@ -451,7 +451,7 @@ def main(args): matrix = generate_matrix(args) is_matrix_empty = len(matrix) == 0 matrix = sorted(matrix, key=lambda x: (x.name, x.category, x.version)) - matrix = [x for x in matrix if x.flavor != "gluon"] + matrix = [x for x in matrix if x.flavor not in ("gluon", "mleap")] matrix = {"include": matrix, "job_name": [x.job_name for x in matrix]} print(divider("Matrix")) diff --git a/dev/test-dev-env-setup.sh b/dev/test-dev-env-setup.sh index 623c089de13fd..bb688c000a017 100755 --- a/dev/test-dev-env-setup.sh +++ b/dev/test-dev-env-setup.sh @@ -15,13 +15,13 @@ set -x err=0 -MLFLOW_HOME=$(pwd) -export MLFLOW_HOME +REPO_ROOT=$(git rev-parse --show-toplevel) +export REPO_ROOT # Run the installation of the environment -DEV_DIR=$MLFLOW_HOME/.venvs/mlflow-dev +DEV_DIR=$REPO_ROOT/.venvs/mlflow-dev -"$MLFLOW_HOME"/dev/dev-env-setup.sh -d "$DEV_DIR" -f +"$REPO_ROOT"/dev/dev-env-setup.sh -d "$DEV_DIR" -f source "$DEV_DIR/bin/activate" diff --git a/docs/source/_static/images/llm_evaluate_experiment_view.png b/docs/source/_static/images/llm_evaluate_experiment_view.png new file mode 100644 index 0000000000000..17278ca6f13f0 Binary files /dev/null and b/docs/source/_static/images/llm_evaluate_experiment_view.png differ diff --git a/docs/source/index.rst b/docs/source/index.rst index b4528cc46af77..291d0af171eb4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -35,6 +35,7 @@ Get started using the :ref:`quickstart` or by reading about the :ref:`key concep recipes gateway/index llms/prompt-engineering + llms/llm-evaluate/index plugins 
auth/index cli diff --git a/docs/source/llms/llm-evaluate/index.rst b/docs/source/llms/llm-evaluate/index.rst new file mode 100644 index 0000000000000..80fd49eb77d83 --- /dev/null +++ b/docs/source/llms/llm-evaluate/index.rst @@ -0,0 +1,537 @@ +.. _llm-eval: + +MLflow LLM Evaluate +==================================== + +With the emerging of ChatGPT, LLMs have shown its power of text generation in various fields, such as +question answering, translating and text summarization. Evaluating LLMs' performance is slightly different +from traditional ML models, as very often there is no single ground truth to compare against. +MLflow provides an API :py:func:`mlflow.evaluate()` to help evaluate your LLMs. + +MLflow's LLM evaluation functionality consists of 3 main components: + +1. **A model to evaluate**: it can be an MLflow ``pyfunc`` model, a URI pointing to one registered + MLflow model, or any python callable that represents your model, e.g, a HuggingFace text summarization pipeline. +2. **Metrics**: the metrics to compute, LLM evaluate will use LLM metrics. +3. **Evaluation data**: the data your model is evaluated at, it can be a pandas Dataframe, a python list, a + numpy array or an :py:func:`mlflow.data.dataset.Dataset` instance. + + +Quickstart +========== + +Below is a simple example that gives an quick overview of how MLflow LLM evaluation works. The example builds +a simple question-answering model by wrapping "openai/gpt-4" with custom prompt. You can paste it to +your IPython or local editor and execute it, and install missing dependencies as prompted. Running the code +requires OpenAI API key, if you don't have an OpenAI key, you can set it up [here](https://platform.openai.com/account/api-keys). + +.. code-block:: shell + + export OPENAI_API_KEY='your-api-key-here' + +.. code-block:: python + + import mlflow + import openai + import os + import pandas as pd + from getpass import getpass + + eval_data = pd.DataFrame( + { + "inputs": [ + "What is MLflow?", + "What is Spark?", + ], + "ground_truth": [ + "MLflow is an open-source platform for managing the end-to-end machine learning (ML) " + "lifecycle. It was developed by Databricks, a company that specializes in big data and " + "machine learning solutions. MLflow is designed to address the challenges that data " + "scientists and machine learning engineers face when developing, training, and deploying " + "machine learning models.", + "Apache Spark is an open-source, distributed computing system designed for big data " + "processing and analytics. It was developed in response to limitations of the Hadoop " + "MapReduce computing model, offering improvements in speed and ease of use. Spark " + "provides libraries for various tasks such as data ingestion, processing, and analysis " + "through its components like Spark SQL for structured data, Spark Streaming for " + "real-time data processing, and MLlib for machine learning tasks", + ], + } + ) + + with mlflow.start_run() as run: + system_prompt = "Answer the following question in two sentences" + # Wrap "gpt-4" as an MLflow model. + logged_model_info = mlflow.openai.log_model( + model="gpt-4", + task=openai.ChatCompletion, + artifact_path="model", + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": "{question}"}, + ], + ) + + # Use predefined question-answering metrics to evaluate our model. 
+ results = mlflow.evaluate( + logged_model_info.model_uri, + eval_data, + targets="ground_truth", + model_type="question-answering", + ) + print(f"See aggregated evaluation results below: \n{results.metrics}") + + # Evaluation result for each data record is available in `results.tables`. + eval_table = results.tables["eval_results_table"] + print(f"See evaluation table below: \n{eval_table}") + + +LLM Evaluation Metrics +======================= + +There are two types of LLM evaluation metrics in MLflow: + +1. Metrics relying on SaaS model (e.g., OpenAI) for scoring, e.g., :py:func:`mlflow.metrics.answer_relevance`. These + metrics are created via :py:func:`mlflow.metrics.make_genai_metric` method. For each data record, these metrics under the hood sends + one prompt consisting of the following information to the SaaS model, and extract the score from model response: + + * Metrics definition. + * Metrics grading criteria. + * Reference examples. + * Input data/context. + * Model output. + * [optional] Ground truth. + + More details of how these fields are set can be found in the section "Create your Custom LLM-evaluation Metrics". + +2. Function-based per-row metrics. These metrics calculate a score for each data record (row in terms of Pandas/Spark dataframe), + based on certain functions, like Rouge (:py:func:`mlflow.metrics.rougeL`) or Flesch Kincaid (:py:func:`mlflow.metrics.flesch_kincaid_grade_level`). + These metrics are similar to traditional metrics. + + +Select Metrics to Evaluate +-------------------------- + +MLflow LLM evaluation includes default collections of metrics for pre-selected tasks, e.g, "question-answering". Depending on the +LLM use case that you are evaluating, these pre-defined collections can greatly simplify the process of running evaluations. +The default metrics for given model types are shown below: + +* **question-answering**: ``model_type="question-answering"``: + + * exact-match + * `toxicity `_ :sup:`1` + * `ari_grade_level `_ :sup:`2` + * `flesch_kincaid_grade_level `_ :sup:`2` + +* **text-summarization**: ``model_type="text-summarization"``: + + * `ROUGE `_ :sup:`3` + * `toxicity `_ :sup:`1` + * `ari_grade_level `_ :sup:`2` + * `flesch_kincaid_grade_level `_ :sup:`2` + +* **text models**: ``model_type="text"``: + + * `toxicity `_ :sup:`1` + * `ari_grade_level `_ :sup:`2` + * `flesch_kincaid_grade_level `_ :sup:`2` + + +:sup:`1` Requires package `evaluate `_, `pytorch `_, and +`transformers `_ + +:sup:`2` Requires package `textstat `_ + +:sup:`3` Requires package `evaluate `_, `nltk `_, and +`rouge-score `_ + +However, using the pre-defined metrics associated with a given model type is not the only way to generate scoring metrics +for LLM evaluation in MLFlow. MLflow provides two ways for selecting metrics to evluate your LLM: + +1. Specify the ``model_type`` argument in :py:func:`mlflow.evaluate` + + * Each predefined model type comes with a standard set of metrics that are available for relevant evaluation of a model type. + * The defaults are suitable if your model falls in one of the predefined categories (e.g., ``question-answering``). + + An example of using the predefined metrics for a given ``model_type`` is shown below: + + .. code-block:: python + + results = mlflow.evaluate( + model, + eval_data, + targets="ground_truth", + model_type="question-answering", + ) + +2. Specify a custom list of metrics by explicitly referencing a metric calculation function. 
+
+   * To add additional metrics to the default collection from part 1 above, add the function names to the ``extra_metrics`` argument.
+   * To disable default metric calculation and only calculate explicit metrics, remove the ``model_type`` argument and define the desired metrics.
+
+   An example of disabling the default metrics and explicitly declaring a subset of metrics to calculate is shown below:
+
+   .. code-block:: python
+
+        results = mlflow.evaluate(
+            model,
+            eval_data,
+            targets="ground_truth",
+            extra_metrics=[mlflow.metrics.toxicity(), mlflow.metrics.latency()],
+        )
+
+
+The full reference for supported evaluation metrics can be found `here <../python_api/mlflow.html#mlflow.evaluate>`_.
+
+Metrics with LLM as the Judge
+---------------------------------------------
+
+MLflow offers a few pre-canned metrics which use an LLM as the judge. Despite the difference under the hood, the usage
+is the same: put these metrics in the ``extra_metrics`` argument of ``mlflow.evaluate()``. Here is the list of pre-canned
+metrics:
+
+* :py:func:`mlflow.metrics.answer_similarity`: Evaluate the similarity between ground truth and your LLM outputs.
+* :py:func:`mlflow.metrics.answer_correctness`: Evaluate the correctness level of your LLM outputs based on given context
+  and ground truth.
+* :py:func:`mlflow.metrics.answer_relevance`: Evaluate the appropriateness and applicability of the output with
+  respect to the input.
+* :py:func:`mlflow.metrics.faithfulness`: Evaluate the faithfulness of your LLM outputs.
+
+
+Create your Custom LLM-evaluation Metrics
+---------------------------------------------
+
+Create LLM-as-judge Evaluation Metrics (Category 1)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can also create your own SaaS LLM evaluation metrics with the MLflow API :py:func:`mlflow.metrics.make_genai_metric`, which
+needs the following information:
+
+* ``name``: the name of your custom metric.
+* ``definition``: describes what the metric does.
+* ``grading_prompt``: describes the scoring criteria.
+* ``examples``: a few input/output examples with scores; they are used as a reference for the LLM judge.
+* ``model``: the identifier of the LLM judge.
+* ``parameters``: the extra parameters to send to the LLM judge, e.g., ``temperature`` for ``"openai:/gpt-3.5-turbo-16k"``.
+* ``aggregations``: the aggregation strategy for the metric.
+* ``greater_is_better``: indicates whether a higher score means your model is better.
+
+Under the hood, ``definition``, ``grading_prompt`` and ``examples``, together with the evaluation data and model output, are
+composed into a long prompt and sent to the LLM. If you are familiar with the concept of prompt engineering,
+a SaaS LLM evaluation metric is basically an attempt to compose a "right" prompt containing instructions, data and model
+output so that an LLM, e.g., GPT-4, can output the information we want.
+
+Now let's create a custom GenAI metric called "professionalism", which measures how professional our model output is.
+
+Let's first create a few examples with scores; these will be the reference samples the LLM judge uses. To create such examples,
+we will use the :py:class:`mlflow.metrics.EvaluationExample` class, which has 4 fields:
+
+* input: input text.
+* output: output text.
+* score: the score for the output in the context of the input.
+* justification: why we give the `score` for the data.
+
+..
code-block:: python + + professionalism_example_score_2 = mlflow.metrics.EvaluationExample( + input="What is MLflow?", + output=( + "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps " + "you track experiments, package your code and models, and collaborate with your team, making the whole ML " + "workflow smoother. It's like your Swiss Army knife for machine learning!" + ), + score=2, + justification=( + "The response is written in a casual tone. It uses contractions, filler words such as 'like', and " + "exclamation points, which make it sound less professional. " + ), + ) + professionalism_example_score_4 = mlflow.metrics.EvaluationExample( + input="What is MLflow?", + output=( + "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was " + "developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is " + "designed to address the challenges that data scientists and machine learning engineers face when " + "developing, training, and deploying machine learning models.", + ), + score=4, + justification=("The response is written in a formal language and a neutral tone. "), + ) + +Now let's define the ``professionalism`` metric, you will see how each field is set up. + +.. code-block:: python + + professionalism = mlflow.metrics.make_genai_metric( + name="professionalism", + definition=( + "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is " + "tailored to the context and audience. It often involves avoiding overly casual language, slang, or " + "colloquialisms, and instead using clear, concise, and respectful language." + ), + grading_prompt=( + "Professionalism: If the answer is written using a professional tone, below are the details for different scores: " + "- Score 0: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for " + "professional contexts." + "- Score 1: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in " + "some informal professional settings." + "- Score 2: Language is overall formal but still have casual words/phrases. Borderline for professional contexts." + "- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. " + "- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for formal " + "business or academic settings. " + ), + examples=[professionalism_example_score_2, professionalism_example_score_4], + model="openai:/gpt-3.5-turbo-16k", + parameters={"temperature": 0.0}, + aggregations=["mean", "variance"], + greater_is_better=True, + ) + +.. + TODO(prithvi): add best practice for creating GenAI metrics. + + +Create Per-row LLM Evluation Metrics (Category 2) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This is very similar to creating a custom traditional metrics, with the exception of returning a `EvaluationResult` instance. +Basically you need to: + +1. Implement a ``eval_fn`` to define your scoring logic, it must take in 3 args ``predictions``, ``targets`` and ``metrics``. + ``eval_fn`` must return a :py:func:`mlflow.metrics.MetricValue` instance. +2. Pass ``eval_fn`` and other arguments to ``mlflow.metricsmake_metric`` API to create the metric. 
+
+Create Per-row LLM Evaluation Metrics (Category 2)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This is very similar to creating a custom traditional metric, with the exception that your scoring function must return a
+:py:func:`mlflow.metrics.MetricValue` instance. Basically you need to:
+
+1. Implement an ``eval_fn`` to define your scoring logic; it must take 3 arguments: ``predictions``, ``targets``, and ``metrics``.
+   ``eval_fn`` must return a :py:func:`mlflow.metrics.MetricValue` instance.
+2. Pass ``eval_fn`` and other arguments to the ``mlflow.metrics.make_metric`` API to create the metric.
+
+The following code creates a dummy per-row metric called ``"over_10_chars"``: if the model output is longer than 10
+characters, the score is 1; otherwise it is 0.
+
+.. code-block:: python
+
+    from mlflow.metrics import MetricValue, make_metric
+
+
+    def standard_aggregations(scores):
+        # Aggregate the per-row scores however you like; here we simply report the mean.
+        return {"mean": sum(scores) / len(scores)}
+
+
+    def eval_fn(predictions, targets, metrics):
+        scores = []
+        for i in range(len(predictions)):
+            if len(predictions[i]) > 10:
+                scores.append(1)
+            else:
+                scores.append(0)
+        return MetricValue(
+            scores=scores,
+            aggregate_results=standard_aggregations(scores),
+        )
+
+
+    # Create an EvaluationMetric object for the per-row metric.
+    over_10_chars_metric = make_metric(
+        eval_fn=eval_fn, greater_is_better=False, name="over_10_chars"
+    )
+
+
+Prepare Your LLM for Evaluation
+=====================================
+
+In order to evaluate your LLM with ``mlflow.evaluate()``, your LLM has to be one of the following types:
+
+1. A :py:func:`mlflow.pyfunc.PyFuncModel` instance or a URI pointing to a logged ``mlflow.pyfunc.PyFuncModel`` model. In
+   general, we call that an MLflow model.
+2. A Python function that takes in string inputs and outputs a single string. Your callable must match the signature of
+   :py:func:`mlflow.pyfunc.PyFuncModel.predict` (without the `params` argument); briefly, it should:
+
+   * Accept ``data`` as the only argument, which can be a ``pandas.DataFrame``, ``numpy.ndarray``, Python list, dictionary, or SciPy matrix.
+   * Return one of ``pandas.DataFrame``, ``pandas.Series``, ``numpy.ndarray``, or a list.
+3. Set ``model=None``, and put the model outputs in ``data``. Only applicable when the data is a Pandas DataFrame.
+
+Evaluating with an MLflow Model
+---------------------------------
+
+For detailed instructions on how to convert your model into an ``mlflow.pyfunc.PyFuncModel`` instance, please read
+`this doc `_. In short,
+to evaluate your model as an MLflow model, we recommend following the steps below:
+
+1. Convert your LLM to an MLflow model and log it to the MLflow server with ``log_model``. Each flavor (``openai``, ``pytorch``, ...)
+   has its own ``log_model`` API, e.g., :py:func:`mlflow.openai.log_model()`:
+
+   .. code-block:: python
+
+        with mlflow.start_run():
+            system_prompt = "Answer the following question in two sentences"
+            # Wrap "gpt-3.5-turbo" as an MLflow model.
+            logged_model_info = mlflow.openai.log_model(
+                model="gpt-3.5-turbo",
+                task=openai.ChatCompletion,
+                artifact_path="model",
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": "{question}"},
+                ],
+            )
+2. Use the URI of the logged model as the model instance in ``mlflow.evaluate()``:
+
+   .. code-block:: python
+
+        results = mlflow.evaluate(
+            logged_model_info.model_uri,
+            eval_data,
+            targets="ground_truth",
+            model_type="question-answering",
+        )
+
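+Note that both the ``openai``-flavored example above and the custom function example below call the OpenAI API, so
+they assume OpenAI authentication is already configured. A minimal sketch of one common way to do this (the key value
+is a placeholder; prefer loading it from a secret store rather than hard-coding it):
+
+.. code-block:: python
+
+    import os
+
+    # Placeholder value: substitute your own OpenAI API key.
+    os.environ["OPENAI_API_KEY"] = "<your-openai-api-key>"
+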
+Evaluating with a Custom Function
+----------------------------------
+
+As of MLflow 2.8.0, :py:func:`mlflow.evaluate()` supports evaluating a Python function without requiring
+the model to be logged to MLflow. This is useful when you don't want to log the model and just want to evaluate
+it. The following example uses :py:func:`mlflow.evaluate()` to evaluate a function. You also need to set
+up OpenAI authentication to run the code below.
+
+.. code-block:: python
+
+    import mlflow
+    import openai
+    import pandas as pd
+
+    eval_data = pd.DataFrame(
+        {
+            "inputs": [
+                "What is MLflow?",
+                "What is Spark?",
+            ],
+            "ground_truth": [
+                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.",
+                "Apache Spark is an open-source, distributed computing system designed for big data processing and analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, offering improvements in speed and ease of use. Spark provides libraries for various tasks such as data ingestion, processing, and analysis through its components like Spark SQL for structured data, Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
+            ],
+        }
+    )
+
+
+    def openai_qa(inputs):
+        answers = []
+        system_prompt = "Please answer the following question in formal language."
+        for index, row in inputs.iterrows():
+            completion = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {"role": "system", "content": system_prompt},
+                    {"role": "user", "content": row["inputs"]},
+                ],
+            )
+            answers.append(completion.choices[0].message.content)
+
+        return answers
+
+
+    with mlflow.start_run() as run:
+        results = mlflow.evaluate(
+            openai_qa,
+            eval_data,
+            model_type="question-answering",
+        )
+
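+As with any ``mlflow.evaluate()`` call, the returned ``results`` object carries both aggregated and per-row results.
+A quick way to inspect them, reusing ``results`` from the example above, is sketched here; the result structure is
+covered in more detail in the "View Evaluation Results" section below:
+
+.. code-block:: python
+
+    # Aggregated metrics across the evaluation dataset.
+    print(results.metrics)
+
+    # Per-row results: inputs, outputs, and per-row metric scores.
+    print(results.tables["eval_results_table"])
+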
+Evaluating with a Static Dataset
+----------------------------------
+
+As of MLflow 2.8.0, :py:func:`mlflow.evaluate()` supports evaluating a static dataset without specifying a model.
+This is useful when you save the model output to a column in a Pandas DataFrame or an MLflow PandasDataset, and
+want to evaluate the static dataset without re-running the model.
+
+If you are using a Pandas DataFrame, you must specify the column name that contains the model output using the
+top-level ``predictions`` parameter in :py:func:`mlflow.evaluate()`:
+
+
+.. code-block:: python
+
+    import mlflow
+    import pandas as pd
+
+    eval_data = pd.DataFrame(
+        {
+            "inputs": [
+                "What is MLflow?",
+                "What is Spark?",
+            ],
+            "ground_truth": [
+                "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. "
+                "It was developed by Databricks, a company that specializes in big data and machine learning solutions. "
+                "MLflow is designed to address the challenges that data scientists and machine learning engineers "
+                "face when developing, training, and deploying machine learning models.",
+                "Apache Spark is an open-source, distributed computing system designed for big data processing and "
+                "analytics. It was developed in response to limitations of the Hadoop MapReduce computing model, "
+                "offering improvements in speed and ease of use. Spark provides libraries for various tasks such as "
+                "data ingestion, processing, and analysis through its components like Spark SQL for structured data, "
+                "Spark Streaming for real-time data processing, and MLlib for machine learning tasks",
+            ],
+            "predictions": [
+                "MLflow is an open-source platform that provides handy tools to manage Machine Learning workflow "
+                "lifecycle in a simple way",
+                "Spark is a popular open-source distributed computing system designed for big data processing and analytics.",
+            ],
+        }
+    )
+
+    with mlflow.start_run() as run:
+        results = mlflow.evaluate(
+            data=eval_data,
+            targets="ground_truth",
+            predictions="predictions",
+            extra_metrics=[mlflow.metrics.answer_similarity()],
+            evaluators="default",
+        )
+        print(f"See aggregated evaluation results below: \n{results.metrics}")
+
+        eval_table = results.tables["eval_results_table"]
+        print(f"See evaluation table below: \n{eval_table}")
+
+
+View Evaluation Results
+========================
+
+View Evaluation Results via Code
+-----------------------------------
+
+``mlflow.evaluate()`` returns the evaluation results as an :py:func:`mlflow.models.EvaluationResult` instance.
+To see the scores on the selected metrics, you can check:
+
+* ``metrics``: stores the aggregated results, like average/variance across the evaluation dataset. Let's take a second
+  pass on the code example above and focus on printing out the aggregated results.
+
+  .. code-block:: python
+
+      with mlflow.start_run() as run:
+          results = mlflow.evaluate(
+              data=eval_data,
+              targets="ground_truth",
+              predictions="predictions",
+              extra_metrics=[mlflow.metrics.answer_similarity()],
+              evaluators="default",
+          )
+          print(f"See aggregated evaluation results below: \n{results.metrics}")
+
+* ``tables["eval_results_table"]``: stores the per-row evaluation results.
+
+  .. code-block:: python
+
+      with mlflow.start_run() as run:
+          results = mlflow.evaluate(
+              data=eval_data,
+              targets="ground_truth",
+              predictions="predictions",
+              extra_metrics=[mlflow.metrics.answer_similarity()],
+              evaluators="default",
+          )
+          print(
+              f"See per-data evaluation results below: \n{results.tables['eval_results_table']}"
+          )
+
+
+View Evaluation Results via MLflow UI
+--------------------------------------
+
+Your evaluation results are automatically logged to the MLflow server, so you can view them directly in the
+MLflow UI. To view the evaluation results in the MLflow UI, please follow the steps below:
+
+1. Go to the experiment view of your MLflow experiment.
+2. Select the "Evaluation" tab.
+3. Select the run(s) whose evaluation results you want to check.
+4. Select the metrics from the dropdown menu on the right side.
+
+Please see the screenshot below for clarity:
+
+
+.. figure:: ../../_static/images/llm_evaluate_experiment_view.png
+    :width: 1024px
+    :align: center
+    :alt: Demo UI of MLflow evaluate
\ No newline at end of file
diff --git a/docs/source/models.rst b/docs/source/models.rst
index 45201a21d852e..d3d7897daebf3 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -2551,7 +2551,7 @@ Model with the ``openai`` flavor as a dictionary of the model's attributes. Example: -.. literalinclude:: ../../examples/openai/pyfunc.py +.. literalinclude:: ../../examples/openai/chat_completions.py :language: python @@ -3644,10 +3644,10 @@ each model: For additional examples demonstrating the use of ``mlflow.evaluate()`` with LLMs, check out the `MLflow LLMs example repository `_.
-Evaluating with Custom Metrics
+Evaluating with Extra Metrics
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-If the default set of metrics is insufficient, you can supply ``custom_metrics`` and ``custom_artifacts``
+If the default set of metrics is insufficient, you can supply ``extra_metrics`` and ``custom_artifacts``
 to :py:func:`mlflow.evaluate()` to produce custom metrics and artifacts for the model(s) that you're evaluating. The following `short example from the MLflow GitHub Repository `_
@@ -3660,6 +3660,43 @@ uses :py:func:`mlflow.evaluate()` with a custom metric function to evaluate the
 For a more comprehensive custom metrics usage example, refer to `this example from the MLflow GitHub Repository `_.
 
+Evaluating with a Function
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+As of MLflow 2.8.0, :py:func:`mlflow.evaluate()` supports evaluating a Python function without requiring
+the model to be logged to MLflow. This is useful when you don't want to log the model and just want to evaluate
+it. The following example uses :py:func:`mlflow.evaluate()` to evaluate a function:
+
+
+.. literalinclude:: ../../examples/evaluation/evaluate_with_function.py
+   :language: python
+
+Evaluating with a Static Dataset
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+As of MLflow 2.8.0, :py:func:`mlflow.evaluate()` supports evaluating a static dataset without specifying a model.
+This is useful when you save the model output to a column in a Pandas DataFrame or an MLflow PandasDataset, and
+want to evaluate the static dataset without re-running the model.
+
+If you are using a Pandas DataFrame, you must specify the column name that contains the model output using the
+top-level ``predictions`` parameter in :py:func:`mlflow.evaluate()`:
+
+.. code-block:: python
+
+    mlflow.evaluate(data=pandas_df, predictions="model_output", ...)
+
+If you are using an MLflow PandasDataset, you must specify the column name that contains the model output using
+the ``predictions`` parameter in :py:func:`mlflow.data.from_pandas()`, and specify ``None`` for the
+``predictions`` parameter in :py:func:`mlflow.evaluate()`:
+
+.. code-block:: python
+
+    dataset = mlflow.data.from_pandas(pandas_df, predictions="model_output")
+    mlflow.evaluate(data=dataset, predictions=None, ...)
+
+The following example uses :py:func:`mlflow.evaluate()` to evaluate a static dataset:
+
+.. literalinclude:: ../../examples/evaluation/evaluate_with_static_dataset.py
+   :language: python
+
 .. _model-validation:
 
 Performing Model Validation
diff --git a/docs/source/python_api/mlflow.data.rst b/docs/source/python_api/mlflow.data.rst
index 8bd15534f9824..4913528d0cbf9 100644
--- a/docs/source/python_api/mlflow.data.rst
+++ b/docs/source/python_api/mlflow.data.rst
@@ -6,7 +6,7 @@ runs with MLflow Tracking, as well as retrieve dataset information from runs. It following important interfaces: * :py:class:`Dataset `: Represents a dataset used in model training or - evaluation, including features, targets, and metadata such as the dataset's name, digest (hash) + evaluation, including features, targets, predictions, and metadata such as the dataset's name, digest (hash) schema, profile, and source. You can log this metadata to a run in MLflow Tracking using the :py:func:`mlflow.log_input()` API. 
``mlflow.data`` provides APIs for constructing :py:class:`Datasets ` from a variety of Python data objects, including diff --git a/docs/source/python_api/mlflow.metrics.rst b/docs/source/python_api/mlflow.metrics.rst index 5503fd2cf85c5..7903cc343f49a 100644 --- a/docs/source/python_api/mlflow.metrics.rst +++ b/docs/source/python_api/mlflow.metrics.rst @@ -12,7 +12,7 @@ The following code demonstrates how to use :py:func:`mlflow.evaluate()` with an .. code-block:: python import mlflow - from mlflow.metrics import EvaluationExample, correctness + from mlflow.metrics import EvaluationExample, answer_similarity eval_df = pd.DataFrame( { @@ -41,83 +41,72 @@ The following code demonstrates how to use :py:func:`mlflow.evaluate()` with an "engineers face when developing, training, and deploying machine learning models." }, ) - correctness_metric = correctness(examples=[example]) + answer_similarity_metric = answer_similarity(examples=[example]) results = mlflow.evaluate( logged_model.model_uri, eval_df, targets="ground_truth", model_type="question-answering", - extra_metrics=[correctness_metric], + extra_metrics=[answer_similarity_metric], ) Evaluation results are stored as :py:class:`MetricValue `. Aggregate results are logged to the MLflow run as metrics, while per-example results are logged to the MLflow run as artifacts in the form of an evaluation table. .. autoclass:: mlflow.metrics.MetricValue -We provide the following builtin :py:class:`EvaluationMetric ` for evaluating models. These metrics are computed automatically depending on the ``model_type``. For more information on the ``model_type`` parameter, see :py:func:`mlflow.evaluate()` API. +We provide the following builtin factory functions to create :py:class:`EvaluationMetric ` for evaluating models. These metrics are computed automatically depending on the ``model_type``. For more information on the ``model_type`` parameter, see :py:func:`mlflow.evaluate()` API. -.. autodata:: mlflow.metrics.mae - :annotation: +.. autofunction:: mlflow.metrics.mae -.. autodata:: mlflow.metrics.mape - :annotation: +.. autofunction:: mlflow.metrics.mape -.. autodata:: mlflow.metrics.max_error - :annotation: +.. autofunction:: mlflow.metrics.max_error -.. autodata:: mlflow.metrics.mse - :annotation: +.. autofunction:: mlflow.metrics.mse -.. autodata:: mlflow.metrics.rmse - :annotation: +.. autofunction:: mlflow.metrics.rmse -.. autodata:: mlflow.metrics.r2_score - :annotation: +.. autofunction:: mlflow.metrics.r2_score -.. autodata:: mlflow.metrics.precision_score - :annotation: +.. autofunction:: mlflow.metrics.precision_score -.. autodata:: mlflow.metrics.recall_score - :annotation: +.. autofunction:: mlflow.metrics.recall_score -.. autodata:: mlflow.metrics.f1_score - :annotation: +.. autofunction:: mlflow.metrics.f1_score -.. autodata:: mlflow.metrics.ari_grade_level - :annotation: +.. autofunction:: mlflow.metrics.ari_grade_level -.. autodata:: mlflow.metrics.flesch_kincaid_grade_level - :annotation: +.. autofunction:: mlflow.metrics.flesch_kincaid_grade_level -.. autodata:: mlflow.metrics.perplexity - :annotation: +.. autofunction:: mlflow.metrics.rouge1 -.. autodata:: mlflow.metrics.rouge1 - :annotation: +.. autofunction:: mlflow.metrics.rouge2 -.. autodata:: mlflow.metrics.rouge2 - :annotation: +.. autofunction:: mlflow.metrics.rougeL -.. autodata:: mlflow.metrics.rougeL - :annotation: +.. autofunction:: mlflow.metrics.rougeLsum -.. autodata:: mlflow.metrics.rougeLsum - :annotation: +.. autofunction:: mlflow.metrics.precision_at_k -.. 
autodata:: mlflow.metrics.toxicity - :annotation: +.. autofunction:: mlflow.metrics.toxicity + +.. autofunction:: mlflow.metrics.token_count + +.. autofunction:: mlflow.metrics.latency Users create their own :py:class:`EvaluationMetric ` using the :py:func:`make_metric ` factory function .. autofunction:: mlflow.metrics.make_metric -We provide the following pre-canned "intelligent" :py:class:`EvaluationMetric ` for evaluating text models. These metrics use an LLM to evaluate the quality of a model's output text. The following factory functions help you customize the intelligent metric to your use case. +We provide the following pre-canned "intelligent" :py:class:`EvaluationMetric ` for evaluating text models. These metrics use an LLM to evaluate the quality of a model's output text. Note that your use of a third party LLM service (e.g., OpenAI) for evaluation may be subject to and governed by the LLM service's terms of use. The following factory functions help you customize the intelligent metric to your use case. + +.. autofunction:: mlflow.metrics.answer_similarity -.. autofunction:: mlflow.metrics.correctness +.. autofunction:: mlflow.metrics.answer_correctness -.. autofunction:: mlflow.metrics.strict_correctness +.. autofunction:: mlflow.metrics.faithfulness -.. autofunction:: mlflow.metrics.relevance +.. autofunction:: mlflow.metrics.answer_relevance Users can also create their own LLM based :py:class:`EvaluationMetric ` using the :py:func:`make_genai_metric ` factory function. @@ -131,4 +120,4 @@ When using LLM based :py:class:`EvaluationMetric \n", + " .table-result-container {\n", + " max-height: 300px;\n", + " overflow: auto;\n", + " }\n", + " table, th, td {\n", + " border: 1px solid black;\n", + " border-collapse: collapse;\n", + " }\n", + " th, td {\n", + " padding: 5px;\n", + " }\n", + " th {\n", + " text-align: left;\n", + " }\n", + "
inputsground_truthoutputstoken_countperplexity/v1/scoreflesch_kincaid_grade_level/v1/scoreari_grade_level/v1/score
How does useEffect() work?The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.useEffect() is a hook in React that allows you to perform side effects in functional components. It takes a function as its first argument and runs that function after the component has rendered and whenever any of its dependencies have changed.4523.479785919211.312.4
What does the static keyword in a function mean?Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.The static keyword in a function means that the function can only be accessed within the same file it is declared in, and is not accessible to other files. It is used to limit the scope of the function and improve encapsulation.4618.2027149210.310.0
What does the 'finally' block in Python do?'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.The 'finally' block in Python is used to define a block of code that will be executed regardless of whether an exception has occurred or not, ensuring that certain clean-up actions are always performed. It is typically used to close files, release resources, or clean up any operations that need to be done before exiting a try-except block.6823.30614280713.416.1
What is the difference between multiprocessing and multithreading?Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.Multiprocessing involves the simultaneous execution of multiple tasks on multiple processors, while multithreading involves the execution of multiple threads within a single process, allowing for concurrent execution of different parts of the program.3912.881818771423.226.3
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "How does useEffect() work?", + "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.", + "useEffect() is a hook in React that allows you to perform side effects in functional components. It takes a function as its first argument and runs that function after the component has rendered and whenever any of its dependencies have changed.", + 45, + 23.4797859192, + 11.3, + 12.4 + ], + [ + "What does the static keyword in a function mean?", + "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.", + "The static keyword in a function means that the function can only be accessed within the same file it is declared in, and is not accessible to other files. It is used to limit the scope of the function and improve encapsulation.", + 46, + 18.20271492, + 10.3, + 10 + ], + [ + "What does the 'finally' block in Python do?", + "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.", + "The 'finally' block in Python is used to define a block of code that will be executed regardless of whether an exception has occurred or not, ensuring that certain clean-up actions are always performed. It is typically used to close files, release resources, or clean up any operations that need to be done before exiting a try-except block.", + 68, + 23.306142807, + 13.4, + 16.1 + ], + [ + "What is the difference between multiprocessing and multithreading?", + "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. 
Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.", + "Multiprocessing involves the simultaneous execution of multiple tasks on multiple processors, while multithreading involves the execution of multiple threads within a single process, allowing for concurrent execution of different parts of the program.", + 39, + 12.8818187714, + 23.2, + 26.3 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "inputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ground_truth", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "outputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "token_count", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "perplexity/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "flesch_kincaid_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "ari_grade_level/v1/score", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "results.tables[\"eval_results_table\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1a7363c9-3b73-4e3f-bf7c-1d6887fb4f9e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## LLM-judged correctness with OpenAI GPT-4" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cd23fe79-cfbf-42a7-a3f3-14badfe20db5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Construct an answer similarity metric using the `answer_similarity()` metric factory function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88b35b52-5b8f-4b72-9de8-fec05f01e722", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EvaluationMetric(name=answer_similarity, greater_is_better=True, long_name=answer_similarity, version=v1, metric_details=\n", + "Task:\n", + "You are an impartial judge. You will be given an input that was sent to a machine\n", + "learning model, and you will be given an output that the model produced. You\n", + "may also be given additional information that was used by the model to generate the output.\n", + "\n", + "Your task is to determine a numerical score called answer_similarity based on the input and output.\n", + "A definition of answer_similarity and a grading rubric are provided below.\n", + "You must use the grading rubric to determine your score. You must also justify your score.\n", + "\n", + "Examples could be included below for reference. 
Make sure to use them as references and to\n", + "understand them before completing the task.\n", + "\n", + "Input:\n", + "{input}\n", + "\n", + "Output:\n", + "{output}\n", + "\n", + "{grading_context_columns}\n", + "\n", + "Metric definition:\n", + "Answer similarity is evaluated on the degree of semantic similarity of the provided output to the provided targets, which is the ground truth. Scores can be assigned based on the gradual similarity in meaning and description to the provided targets, where a higher score indicates greater alignment between the provided output and provided targets.\n", + "\n", + "Grading rubric:\n", + "Answer similarity: Below are the details for different scores:\n", + "- Score 1: the output has little to no semantic similarity to the provided targets.\n", + "- Score 2: the output displays partial semantic similarity to the provided targets on some aspects.\n", + "- Score 3: the output has moderate semantic similarity to the provided targets.\n", + "- Score 4: the output aligns with the provided targets in most aspects and has substantial semantic similarity.\n", + "- Score 5: the output closely aligns with the provided targets in all significant aspects.\n", + "\n", + "Examples:\n", + "\n", + "Input:\n", + "What is MLflow?\n", + "\n", + "Output:\n", + "MLflow is an open-source platform for managing machine learning workflows, including experiment tracking, model packaging, versioning, and deployment, simplifying the ML lifecycle.\n", + "\n", + "Additional information used by the model:\n", + "key: ground_truth\n", + "value:\n", + "MLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.\n", + "\n", + "score: 4\n", + "justification: The definition effectively explains what MLflow is its purpose, and its developer. It could be more concise for a 5-score.\n", + " \n", + "\n", + "You must return the following fields in your response one below the other:\n", + "score: Your numerical score for the model's answer_similarity based on the rubric\n", + "justification: Your step-by-step reasoning about the model's answer_similarity score\n", + " )\n" + ] + } + ], + "source": [ + "from mlflow.metrics import EvaluationExample, answer_similarity\n", + "\n", + "# Create an example to describe what answer_similarity means like for this problem.\n", + "example = EvaluationExample(\n", + " input=\"What is MLflow?\",\n", + " output=\"MLflow is an open-source platform for managing machine \"\n", + " \"learning workflows, including experiment tracking, model packaging, \"\n", + " \"versioning, and deployment, simplifying the ML lifecycle.\",\n", + " score=4,\n", + " justification=\"The definition effectively explains what MLflow is \"\n", + " \"its purpose, and its developer. It could be more concise for a 5-score.\",\n", + " grading_context={\n", + " \"ground_truth\": \"MLflow is an open-source platform for managing \"\n", + " \"the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, \"\n", + " \"a company that specializes in big data and machine learning solutions. 
MLflow is \"\n", + " \"designed to address the challenges that data scientists and machine learning \"\n", + " \"engineers face when developing, training, and deploying machine learning models.\"\n", + " },\n", + ")\n", + "\n", + "# Construct the metric using OpenAI GPT-4 as the judge\n", + "answer_similarity_metric = answer_similarity(model=\"openai:/gpt-4\", examples=[example])\n", + "\n", + "print(answer_similarity_metric)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d627f7ab-a7e1-430d-9431-9ce4bd810fa7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Call `mlflow.evaluate()` again but with your new `answer_similarity_metric`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cae9d80b-39a2-4e98-ac08-bfa5ba387b8f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4a9ad395386743a0a44cce1875382e27", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/5 [00:00\n", + " .table-result-container {\n", + " max-height: 300px;\n", + " overflow: auto;\n", + " }\n", + " table, th, td {\n", + " border: 1px solid black;\n", + " border-collapse: collapse;\n", + " }\n", + " th, td {\n", + " padding: 5px;\n", + " }\n", + " th {\n", + " text-align: left;\n", + " }\n", + "
inputsground_truthoutputstoken_countperplexity/v1/scoreflesch_kincaid_grade_level/v1/scoreari_grade_level/v1/scoreanswer_similarity/v1/scoreanswer_similarity/v1/justification
How does useEffect() work?The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.useEffect() is a function in React that allows you to perform side effects in a functional component. It takes two arguments: a callback function that will be run after the component renders, and an array of dependencies to determine when the callback should be invoked.5115.581936836212.113.54The output provided by the model aligns well with the additional information provided. Both the output and the additional information explain that useEffect() is a function in React that performs actions after the component renders. The output also mentions the two arguments that useEffect() takes, which is not mentioned in the additional information. However, the output does not mention that React will remember the function passed to useEffect() and call it after performing the DOM updates, which is mentioned in the additional information. Therefore, while the output is largely accurate and aligns with the additional information in most aspects, it does not align in all significant aspects, which is why it receives a score of 4.
What does the static keyword in a function mean?Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.In C++, the static keyword used in a function declaration means that the function is limited to the scope of the translation unit and cannot be accessed by other files. In C#, the static keyword used in a function declaration means that the function belongs to the class itself and not to any specific instance of the class.6210.34881877912.214.74The output provided by the model aligns well with the target information. It correctly explains the meaning of the static keyword in a function in both C++ and C#. However, it does not explicitly mention that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any, which is mentioned in the target information. Therefore, while the output is largely accurate and informative, it lacks some details present in the target information, which prevents it from achieving a perfect score.
What does the 'finally' block in Python do?'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.The 'finally' block in Python is used to specify code that will always be executed, whether or not an exception is raised in the try block. This block is typically used for clean-up actions, such as closing files or releasing resources, to ensure they are always performed, regardless of any exceptions.6026.538370132412.114.75The output provided by the model aligns very closely with the provided targets. Both the output and the targets explain that the 'finally' block in Python is used to specify code that will always be executed, regardless of whether an exception is raised in the try block. The output also adds that this block is typically used for clean-up actions, which is an important aspect of the 'finally' block's functionality. Therefore, the output has a high degree of semantic similarity to the provided targets, warranting a score of 5.
What is the difference between multiprocessing and multithreading?Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.Multiprocessing is the use of multiple processors or cores to execute multiple tasks simultaneously, while multithreading is the use of multiple threads within a single process to execute different parts of the code concurrently. In simple terms, multiprocessing involves running multiple processes, while multithreading involves running multiple threads within a single process.626.771638393418.221.34The output provided by the model aligns closely with the provided targets. Both the output and the targets explain the difference between multiprocessing and multithreading, and they both mention that multiprocessing involves multiple processors or cores, while multithreading involves multiple threads within a single process. However, the output does not mention that each processor in multiprocessing can run one or more threads, which is mentioned in the targets. Therefore, the output aligns with the targets in most aspects, but not all, which is why a score of 4 is given.
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "How does useEffect() work?", + "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.", + "useEffect() is a function in React that allows you to perform side effects in a functional component. It takes two arguments: a callback function that will be run after the component renders, and an array of dependencies to determine when the callback should be invoked.", + 51, + 15.5819368362, + 12.1, + 13.5, + 4, + "The output provided by the model aligns well with the additional information provided. Both the output and the additional information explain that useEffect() is a function in React that performs actions after the component renders. The output also mentions the two arguments that useEffect() takes, which is not mentioned in the additional information. However, the output does not mention that React will remember the function passed to useEffect() and call it after performing the DOM updates, which is mentioned in the additional information. Therefore, while the output is largely accurate and aligns with the additional information in most aspects, it does not align in all significant aspects, which is why it receives a score of 4." + ], + [ + "What does the static keyword in a function mean?", + "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.", + "In C++, the static keyword used in a function declaration means that the function is limited to the scope of the translation unit and cannot be accessed by other files. In C#, the static keyword used in a function declaration means that the function belongs to the class itself and not to any specific instance of the class.", + 62, + 10.348818779, + 12.2, + 14.7, + 4, + "The output provided by the model aligns well with the target information. It correctly explains the meaning of the static keyword in a function in both C++ and C#. However, it does not explicitly mention that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any, which is mentioned in the target information. Therefore, while the output is largely accurate and informative, it lacks some details present in the target information, which prevents it from achieving a perfect score." + ], + [ + "What does the 'finally' block in Python do?", + "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.", + "The 'finally' block in Python is used to specify code that will always be executed, whether or not an exception is raised in the try block. This block is typically used for clean-up actions, such as closing files or releasing resources, to ensure they are always performed, regardless of any exceptions.", + 60, + 26.5383701324, + 12.1, + 14.7, + 5, + "The output provided by the model aligns very closely with the provided targets. 
Both the output and the targets explain that the 'finally' block in Python is used to specify code that will always be executed, regardless of whether an exception is raised in the try block. The output also adds that this block is typically used for clean-up actions, which is an important aspect of the 'finally' block's functionality. Therefore, the output has a high degree of semantic similarity to the provided targets, warranting a score of 5." + ], + [ + "What is the difference between multiprocessing and multithreading?", + "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.", + "Multiprocessing is the use of multiple processors or cores to execute multiple tasks simultaneously, while multithreading is the use of multiple threads within a single process to execute different parts of the code concurrently. In simple terms, multiprocessing involves running multiple processes, while multithreading involves running multiple threads within a single process.", + 62, + 6.7716383934, + 18.2, + 21.3, + 4, + "The output provided by the model aligns closely with the provided targets. Both the output and the targets explain the difference between multiprocessing and multithreading, and they both mention that multiprocessing involves multiple processors or cores, while multithreading involves multiple threads within a single process. However, the output does not mention that each processor in multiprocessing can run one or more threads, which is mentioned in the targets. Therefore, the output aligns with the targets in most aspects, but not all, which is why a score of 4 is given." 
+ ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "inputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ground_truth", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "outputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "token_count", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "perplexity/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "flesch_kincaid_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "ari_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "answer_similarity/v1/score", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "answer_similarity/v1/justification", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "results.tables[\"eval_results_table\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "85402663-b9d7-4812-a7d2-32aa5b929687", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Custom LLM-judged metric for professionalism" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a8765226-5d95-49e8-88d8-5ba442ea3b9b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Create a custom metric that will be used to determine professionalism of the model outputs. Use `make_genai_metric` with a metric definition, grading prompt, grading example, and judge model configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45cca2ec-e06b-4d51-9dde-3cc630df9244", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EvaluationMetric(name=professionalism, greater_is_better=True, long_name=professionalism, version=v1, metric_details=\n", + "Task:\n", + "You are an impartial judge. You will be given an input that was sent to a machine\n", + "learning model, and you will be given an output that the model produced. You\n", + "may also be given additional information that was used by the model to generate the output.\n", + "\n", + "Your task is to determine a numerical score called professionalism based on the input and output.\n", + "A definition of professionalism and a grading rubric are provided below.\n", + "You must use the grading rubric to determine your score. You must also justify your score.\n", + "\n", + "Examples could be included below for reference. Make sure to use them as references and to\n", + "understand them before completing the task.\n", + "\n", + "Input:\n", + "{input}\n", + "\n", + "Output:\n", + "{output}\n", + "\n", + "{grading_context_columns}\n", + "\n", + "Metric definition:\n", + "Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is tailored to the context and audience. 
It often involves avoiding overly casual language, slang, or colloquialisms, and instead using clear, concise, and respectful language\n", + "\n", + "Grading rubric:\n", + "Professionalism: If the answer is written using a professional tone, below are the details for different scores: - Score 1: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for professional contexts.- Score 2: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in some informal professional settings.- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. - Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for business or academic settings. - Score 5: Language is excessively formal, respectful, and avoids casual elements. Appropriate for the most formal settings such as textbooks. \n", + "\n", + "Examples:\n", + "\n", + "Input:\n", + "What is MLflow?\n", + "\n", + "Output:\n", + "MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. It helps you track experiments, package your code and models, and collaborate with your team, making the whole ML workflow smoother. It's like your Swiss Army knife for machine learning!\n", + "\n", + "\n", + "\n", + "score: 2\n", + "justification: The response is written in a casual tone. It uses contractions, filler words such as 'like', and exclamation points, which make it sound less professional. \n", + " \n", + "\n", + "You must return the following fields in your response one below the other:\n", + "score: Your numerical score for the model's professionalism based on the rubric\n", + "justification: Your step-by-step reasoning about the model's professionalism score\n", + " )\n" + ] + } + ], + "source": [ + "from mlflow.metrics import EvaluationExample, make_genai_metric\n", + "\n", + "professionalism_metric = make_genai_metric(\n", + " name=\"professionalism\",\n", + " definition=(\n", + " \"Professionalism refers to the use of a formal, respectful, and appropriate style of communication that is tailored to the context and audience. It often involves avoiding overly casual language, slang, or colloquialisms, and instead using clear, concise, and respectful language\"\n", + " ),\n", + " grading_prompt=(\n", + " \"Professionalism: If the answer is written using a professional tone, below \"\n", + " \"are the details for different scores: \"\n", + " \"- Score 1: Language is extremely casual, informal, and may include slang or colloquialisms. Not suitable for professional contexts.\"\n", + " \"- Score 2: Language is casual but generally respectful and avoids strong informality or slang. Acceptable in some informal professional settings.\"\n", + " \"- Score 3: Language is balanced and avoids extreme informality or formality. Suitable for most professional contexts. \"\n", + " \"- Score 4: Language is noticeably formal, respectful, and avoids casual elements. Appropriate for business or academic settings. \"\n", + " \"- Score 5: Language is excessively formal, respectful, and avoids casual elements. Appropriate for the most formal settings such as textbooks. \"\n", + " ),\n", + " examples=[\n", + " EvaluationExample(\n", + " input=\"What is MLflow?\",\n", + " output=(\n", + " \"MLflow is like your friendly neighborhood toolkit for managing your machine learning projects. 
It helps you track experiments, package your code and models, and collaborate with your team, making the whole ML workflow smoother. It's like your Swiss Army knife for machine learning!\"\n", + " ),\n", + " score=2,\n", + " justification=(\n", + " \"The response is written in a casual tone. It uses contractions, filler words such as 'like', and exclamation points, which make it sound less professional. \"\n", + " ),\n", + " )\n", + " ],\n", + " version=\"v1\",\n", + " model=\"openai:/gpt-4\",\n", + " parameters={\"temperature\": 0.0},\n", + " grading_context_columns=[],\n", + " aggregations=[\"mean\", \"variance\", \"p90\"],\n", + " greater_is_better=True,\n", + ")\n", + "\n", + "print(professionalism_metric)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bc615396-b1c1-4302-872d-d19be010382a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "TODO: Try out your new professionalism metric on a sample output to make sure it behaves as you expect" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0ca7e945-113a-49ac-8324-2f94efa45771", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Call `mlflow.evaluate` with your new professionalism metric. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "07bb41ae-c878-4384-b36e-3dfb9b8ac6d9", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8b7b987f9e46430fac997e9867255c5f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/5 [00:00\n", + " .table-result-container {\n", + " max-height: 300px;\n", + " overflow: auto;\n", + " }\n", + " table, th, td {\n", + " border: 1px solid black;\n", + " border-collapse: collapse;\n", + " }\n", + " th, td {\n", + " padding: 5px;\n", + " }\n", + " th {\n", + " text-align: left;\n", + " }\n", + "
inputsground_truthoutputstoken_countperplexity/v1/scoreflesch_kincaid_grade_level/v1/scoreari_grade_level/v1/scoreprofessionalism/v1/scoreprofessionalism/v1/justification
How does useEffect() work?The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.useEffect() is a hook in React that allows you to perform side effects in functional components. It is triggered after the component is rendered and whenever the component's dependencies change, allowing you to run code at specific points during the component's lifecycle.4922.729148864712.715.04The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the concept. It is appropriate for business or academic settings. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it doesn't score a 5.
What does the static keyword in a function mean?Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.The \"static\" keyword in a function means that the function exists only within the scope of the current file, and cannot be called from other files. It also means that the function's local variables retain their values between multiple invocations of the function.5015.908230781610.513.54The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear and concise, making it easy to understand. The use of quotation marks around the term \"static\" shows a level of formality and precision. The language is not excessively formal, so it does not reach a score of 5, but it is more formal than a balanced language, which would score a 3. Therefore, a score of 4 is appropriate.
What does the 'finally' block in Python do?'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.In Python, the 'finally' block is used to execute a set of statements whether an exception is raised or not, ensuring that certain code is always executed regardless of the outcome of a try-except block. It is commonly used for activities such as closing files or releasing resources that need to be done regardless of whether an exception occurred or not.6919.983512878414.016.74The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the 'finally' block in Python. The tone is appropriate for business or academic settings, hence the score of 4.
What is the difference between multiprocessing and multithreading?Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.Multiprocessing involves running multiple processes simultaneously on multiple cores to increase efficiency and achieve parallelism, while multithreading involves running multiple threads within a single process, allowing for concurrent execution and sharing of resources.4019.10858726524.429.34The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not merit a score of 5.
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "How does useEffect() work?", + "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.", + "useEffect() is a hook in React that allows you to perform side effects in functional components. It is triggered after the component is rendered and whenever the component's dependencies change, allowing you to run code at specific points during the component's lifecycle.", + 49, + 22.7291488647, + 12.7, + 15, + 4, + "The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the concept. It is appropriate for business or academic settings. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it doesn't score a 5." + ], + [ + "What does the static keyword in a function mean?", + "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.", + "The \"static\" keyword in a function means that the function exists only within the scope of the current file, and cannot be called from other files. It also means that the function's local variables retain their values between multiple invocations of the function.", + 50, + 15.9082307816, + 10.5, + 13.5, + 4, + "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear and concise, making it easy to understand. The use of quotation marks around the term \"static\" shows a level of formality and precision. The language is not excessively formal, so it does not reach a score of 5, but it is more formal than a balanced language, which would score a 3. Therefore, a score of 4 is appropriate." + ], + [ + "What does the 'finally' block in Python do?", + "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.", + "In Python, the 'finally' block is used to execute a set of statements whether an exception is raised or not, ensuring that certain code is always executed regardless of the outcome of a try-except block. It is commonly used for activities such as closing files or releasing resources that need to be done regardless of whether an exception occurred or not.", + 69, + 19.9835128784, + 14, + 16.7, + 4, + "The language used in the output is formal and respectful. It avoids casual elements and provides a clear, concise explanation of the 'finally' block in Python. The tone is appropriate for business or academic settings, hence the score of 4." + ], + [ + "What is the difference between multiprocessing and multithreading?", + "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. 
Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.", + "Multiprocessing involves running multiple processes simultaneously on multiple cores to increase efficiency and achieve parallelism, while multithreading involves running multiple threads within a single process, allowing for concurrent execution and sharing of resources.", + 40, + 19.108587265, + 24.4, + 29.3, + 4, + "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not merit a score of 5." + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "inputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ground_truth", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "outputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "token_count", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "perplexity/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "flesch_kincaid_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "ari_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "professionalism/v1/score", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "professionalism/v1/justification", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "results.tables[\"eval_results_table\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "52e9f69f-2f43-46ba-bf88-b4aebae741f4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "The professionalism score of the `basic_qa_model` is not very good. Let's try to create a new model that can perform better" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b4ea81e9-6e91-43e7-8539-8dab7b5f52de", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Call `mlflow.evaluate()` using the new model. Observe that the professionalism score has increased!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5b21ef8f-50ef-4229-83c9-cc2251a081e2", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023/10/19 22:41:57 WARNING mlflow.models.model: Model logged without a signature. Signatures will be required for upcoming model registry features as they validate model inputs and denote the expected schema of model outputs. 
Please visit https://www.mlflow.org/docs/2.7.2/models.html#set-signature-on-logged-model for instructions on setting a model signature on your logged model.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8714e76f6c9b40a2949a26cb871dfc36", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading artifacts: 0%| | 0/5 [00:00\n", + " .table-result-container {\n", + " max-height: 300px;\n", + " overflow: auto;\n", + " }\n", + " table, th, td {\n", + " border: 1px solid black;\n", + " border-collapse: collapse;\n", + " }\n", + " th, td {\n", + " padding: 5px;\n", + " }\n", + " th {\n", + " text-align: left;\n", + " }\n", + "
[Rendered HTML results table (markup lost in extraction): columns inputs, ground_truth, outputs, token_count, perplexity/v1/score, flesch_kincaid_grade_level/v1/score, ari_grade_level/v1/score, professionalism/v1/score, professionalism/v1/justification; the same four rows appear verbatim in the "data" array below.]
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "How does useEffect() work?", + "The useEffect() hook tells React that your component needs to do something after render. React will remember the function you passed (we’ll refer to it as our “effect”), and call it later after performing the DOM updates.", + "I would be delighted to expound upon the mechanics of the useEffect() function. useEffect() is a hook provided by the React library that facilitates the synchronization of side effects with the React components lifecycle method. It serves as a tool for performing actions that necessitate interaction with external entities, such as fetching data from an API or subscribing to events. The principal construct of useEffect() hinges on the concept of dependencies. By specifying an array of dependencies, which can consist of state variables, the useEffect() hook will execute the specified effect only when these dependencies undergo changes.\n\nOnce the component mounts, useEffect() initially triggers the specified effect, consequently performing the designated side effect. During subsequent renders, useEffect() will carry out the cleanup operation, accommodating for the possibility of multiple effects being executed successively. Should any of the supplied dependencies undergo alteration, useEffect() will then reexecute the effect, ensuring synchronization with the altered state of the component.\n\nIn essence, useEffect() effectively accommodates for the execution of side effects with accuracy and efficiency, aligning them with the React component lifecycle and facilitating the necessary synchronization with state updates.", + 220, + 25.9580955505, + 16.1, + 18, + 4, + "The language used in the output is formal and respectful, with no casual elements or slang. The explanation is clear, concise, and detailed, demonstrating a high level of professionalism. However, it does not reach the level of excessive formality that would be expected in the most formal settings such as textbooks, hence the score is not 5." + ], + [ + "What does the static keyword in a function mean?", + "Static members belongs to the class, rather than a specific instance. This means that only one instance of a static member exists, even if you create multiple objects of the class, or if you don't create any. It will be shared by all objects.", + "The static keyword, when applied to a function within the realm of computer programming, denotes a special attribute that imparts a significant alteration to the behavior of the said function. Specifically, the static keyword, operating in this context, restricts the visibility and scope of the function to the current translation unit, consequently prohibiting its accessibility from other external units of code. Consequently, the function annotated with the static modifier may solely be invoked within the same translation unit, and any attempts to access or execute it from an external module shall be rendered futile.", + 107, + 31.0698623657, + 17.9, + 20.8, + 5, + "The language used in the output is excessively formal and respectful. It avoids casual elements and is appropriate for the most formal settings such as textbooks. The response is clear, concise, and uses technical language appropriate for the subject matter. 
It avoids any slang or colloquialisms, and is tailored to an audience that understands the context of the question." + ], + [ + "What does the 'finally' block in Python do?", + "'Finally' defines a block of code to run when the try... except...else block is final. The finally block will be executed no matter if the try block raises an error or not.", + "The 'finally' block in Python encompasses a set of statements that are executed regardless of whether an exception occurred or not in the preceding 'try' and 'except' blocks. It serves as a reliable means to perform essential tasks, such as closing files or releasing resources, in order to uphold code integrity and adherent to best coding practices. Consequently, the 'finally' block guarantees the execution of these imperative actions, regardless of how the preceding logic unfolds, thus enhancing the robustness and fault tolerance of Python programs.", + 103, + 33.5567054749, + 15.5, + 18.9, + 4, + "The language used in the output is formal and respectful. It avoids casual elements and is appropriate for business or academic settings. The explanation is clear, concise, and uses technical terms appropriately. However, it is not excessively formal to the point of being suitable for the most formal settings such as textbooks, hence it does not warrant a score of 5." + ], + [ + "What is the difference between multiprocessing and multithreading?", + "Multithreading refers to the ability of a processor to execute multiple threads concurrently, where each thread runs a process. Whereas multiprocessing refers to the ability of a system to run multiple processors in parallel, where each processor can run one or more threads.", + "Greetings,\n\nI hereby address your esteemed query regarding the distinction between multiprocessing and multithreading.\n\nIn essence, multiprocessing and multithreading serve the purpose of enhancing the efficiency and effectiveness of simultaneous execution within a computer system. However, they differ considerably in terms of their operational principles and implementation.\n\nTo begin with, multiprocessing refers to the deployment of multiple processors or central processing units (CPUs) to execute multiple tasks or processes concurrently. Each processor operates independently and can execute distinct tasks simultaneously, thereby realizing parallel processing. This architectural approach is particularly beneficial when it comes to computationally intensive applications, as it allows for increased speed and performance.\n\nOn the other hand, multithreading entails the notion of creating multiple threads within a single process. A thread, as a lightweight unit of execution, constitutes a sequence of instructions that can be executed independently. By employing multiple threads, a program can concurrently carry out multiple tasks within a shared memory space. Notably, these threads share the same resources and context, including memory and file systems, which can lead to potential synchronization and communication challenges.\n\nIn summary, multiprocessing leverages multiple processors or CPUs to execute distinct tasks in parallel, whereas multithreading enables the concurrent execution of multiple threads within a single process, sharing resources and memory space. 
The choice between multiprocessing and multithreading depends on the specific requirements, resources, and objectives of the system or application at hand.\n\nI trust that this elucidation satisfactorily addresses your inquiry with regards to the disparity between multiprocessing and multithreading. Should you have any further queries or necessitate additional clarification, I remain at your disposal.\n\nYours respectfully,", + 324, + 11.5124549866, + 16.6, + 18.8, + 5, + "The response is written in an excessively formal and respectful tone. It uses formal greetings and sign-offs, avoids casual elements, and uses complex sentence structures. The language is appropriate for the most formal settings such as textbooks." + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "inputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ground_truth", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "outputs", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "token_count", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "perplexity/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "flesch_kincaid_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "ari_grade_level/v1/score", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "professionalism/v1/score", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "professionalism/v1/justification", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "results.tables[\"eval_results_table\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e44bbe77-433a-4e03-a44e-d17eb6c06820", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "LLM Evaluation Examples -- QA", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/evaluation/LLM Evaluation Examples -- RAG.ipynb b/examples/evaluation/LLM Evaluation Examples -- RAG.ipynb new file mode 100644 index 0000000000000..9dfb44e79a8fe --- /dev/null +++ b/examples/evaluation/LLM Evaluation Examples -- RAG.ipynb @@ -0,0 +1,624 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "42084110-295b-493a-9b3e-5d8d29ff78b3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# LLM RAG Evaluation with MLflow Example Notebook\n", + "\n", + "In this notebook, we will demonstrate how to evaluate various a RAG system with MLflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bdff35e3-0e09-48b8-87ce-78759de88998", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Set OpenAI Key" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bec25067-224d-4ee8-9b5d-0beeb6cde684", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = \"redacted\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb946228-62fb-4d68-9732-75935c9cb401", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "import mlflow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "273d1345-95d7-435a-a7b6-a5f3dbb3f073", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Create a RAG system\n", + "\n", + "Use Langchain and Chroma to create a RAG system that answers questions based on the MLflow documentation." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2c28d0ad-f469-46ab-a2b4-c5e8db50a729", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQA\n", + "from langchain.document_loaders import WebBaseLoader\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.llms import OpenAI\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Chroma" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "83a7e77e-6717-472a-86dc-02e2c356ddef", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "loader = WebBaseLoader(\"https://mlflow.org/docs/latest/index.html\")\n", + "\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "texts = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "docsearch = Chroma.from_documents(texts, embeddings)\n", + "\n", + "qa = RetrievalQA.from_chain_type(\n", + " llm=OpenAI(temperature=0),\n", + " chain_type=\"stuff\",\n", + " retriever=docsearch.as_retriever(),\n", + " return_source_documents=True,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fd70bcf6-7c44-44d3-9435-567b82611e1c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Evaluate the RAG system using `mlflow.evaluate()`" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + 
"inputWidgets": {}, + "nuid": "de1bc359-2e40-459c-bea4-bed35a117988", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Create a simple function that runs each input through the RAG chain" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "667ec809-2bb5-4170-9937-6804386b41ec", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def model(input_df):\n", + " answer = []\n", + " for index, row in input_df.iterrows():\n", + " answer.append(qa(row[\"questions\"]))\n", + "\n", + " return answer" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d1064306-b7f3-4b3e-825c-4353d808f21d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Create an eval dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a5481491-e4a9-42ea-8a3f-f527faffd04d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "eval_df = pd.DataFrame(\n", + " {\n", + " \"questions\": [\n", + " \"What is MLflow?\",\n", + " \"How to run Mlflow.evaluate()?\",\n", + " \"How to log_table()?\",\n", + " \"How to load_table()?\",\n", + " ],\n", + " }\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9c3c8023-8feb-427a-b36d-34cd1853a5dc", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Create a faithfulness metric" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3882b940-9c25-41ce-a301-72d8c0c90aaa", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.metrics.genai.metric_definitions import faithfulness\n", + "\n", + "faithfulness_metric = faithfulness(model=\"openai:/gpt-4\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ea40ce52-6ac7-4c20-9669-d24f80a6cebe", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023/10/23 13:13:16 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3\n", + "Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3\n", + "Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3\n", + "Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3\n", + "Using pad_token, but it is not set yet.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "23e9a5f58f1b4930ac47c88259156e1d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 
0/1 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
[Rendered HTML results table (markup lost in extraction): columns questions, outputs, query, source_documents, latency, token_count, toxicity/v1/score, perplexity/v1/score, flesch_kincaid_grade_level/v1/score, ari_grade_level/v1/score, faithfulness/v1/score, faithfulness/v1/justification; the same rows appear in the text/plain rendering below.]
\n", + "" + ], + "text/plain": [ + " questions \\\n", + "0 What is MLflow? \n", + "1 How to run Mlflow.evaluate()? \n", + "2 How to log_table()? \n", + "3 How to load_table()? \n", + "\n", + " outputs \\\n", + "0 MLflow is an open source platform for managin... \n", + "1 \\n\\nYou can run Mlflow.evaluate() by using the... \n", + "2 \\n\\nYou can use the log_table() function in ML... \n", + "3 load_table() is not a function in MLflow. \n", + "\n", + " query \\\n", + "0 What is MLflow? \n", + "1 How to run Mlflow.evaluate()? \n", + "2 How to log_table()? \n", + "3 How to load_table()? \n", + "\n", + " source_documents latency token_count \\\n", + "0 [{'lc_attributes': {}, 'lc_namespace': ['langc... 3.970739 176 \n", + "1 [{'lc_attributes': {}, 'lc_namespace': ['langc... 1.083653 39 \n", + "2 [{'lc_attributes': {}, 'lc_namespace': ['langc... 2.833117 114 \n", + "3 [{'lc_attributes': {}, 'lc_namespace': ['langc... 3.736170 11 \n", + "\n", + " toxicity/v1/score perplexity/v1/score \\\n", + "0 0.000208 28.626591 \n", + "1 0.000179 44.533493 \n", + "2 0.000564 13.269521 \n", + "3 0.000144 193.916275 \n", + "\n", + " flesch_kincaid_grade_level/v1/score ari_grade_level/v1/score \\\n", + "0 15.4 18.9 \n", + "1 4.7 4.5 \n", + "2 7.9 8.8 \n", + "3 2.5 5.6 \n", + "\n", + " faithfulness/v1/score faithfulness/v1/justification \n", + "0 5 The output provided by the model is a detailed... \n", + "1 5 The output states that \"You can run Mlflow.eva... \n", + "2 1 The output provides a detailed explanation of ... \n", + "3 5 The output states that \"load_table() is not a ... " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "results.tables[\"eval_results_table\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "LLM Evaluation Examples -- RAG", + "widgets": {} + }, + "kernelspec": { + "display_name": "mlflow-dev-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.17" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/examples/evaluation/README.md b/examples/evaluation/README.md index 8dcd8fe7a6f50..eaec69df35b88 100644 --- a/examples/evaluation/README.md +++ b/examples/evaluation/README.md @@ -2,7 +2,7 @@ The examples in this directory demonstrate how to use the `mlflow.evaluate()` API. Specifically, they show how to evaluate a PyFunc model on a specified dataset using the builtin default evaluator -and specified custom metrics, where the resulting metrics & artifacts are logged to MLflow Tracking. +and specified extra metrics, where the resulting metrics & artifacts are logged to MLflow Tracking. They also show how to specify validation thresholds for the resulting metrics to validate the quality of your model. See full list of examples below: @@ -18,7 +18,7 @@ of your model. 
See full list of examples below: with a comprehensive list of custom metric functions on dataset loaded by `sklearn.datasets.fetch_california_housing` - Example `evaluate_with_model_validation.py` trains both a candidate xgboost `XGBClassifier` model and a baseline `DummyClassifier` model on dataset loaded by `shap.datasets.adult`. Then, it validates - the candidate model against specified thresholds on both builtin and custom metrics and the dummy model. + the candidate model against specified thresholds on both builtin and extra metrics and the dummy model. #### Prerequisites diff --git a/examples/evaluation/evaluate_with_llm_judge.py b/examples/evaluation/evaluate_with_llm_judge.py index c87b484b71f4f..2a95ae21cf415 100644 --- a/examples/evaluation/evaluate_with_llm_judge.py +++ b/examples/evaluation/evaluate_with_llm_judge.py @@ -4,7 +4,7 @@ import pandas as pd import mlflow -from mlflow.metrics import EvaluationExample, correctness +from mlflow.metrics import EvaluationExample, answer_similarity assert "OPENAI_API_KEY" in os.environ, "Please set the OPENAI_API_KEY environment variable." @@ -27,7 +27,7 @@ }, ) -correctness_metric = correctness(examples=[example]) +answer_similarity_metric = answer_similarity(examples=[example]) eval_df = pd.DataFrame( { @@ -61,7 +61,7 @@ eval_df, targets="ground_truth", model_type="question-answering", - extra_metrics=[correctness_metric], + extra_metrics=[answer_similarity_metric], ) print(results) diff --git a/examples/flower_classifier/image_pyfunc.py b/examples/flower_classifier/image_pyfunc.py index bb8b43dbd3c27..ba77072f78078 100644 --- a/examples/flower_classifier/image_pyfunc.py +++ b/examples/flower_classifier/image_pyfunc.py @@ -173,7 +173,7 @@ def _load_pyfunc(path): - conda-forge dependencies: - python=={python_version} - - pip=={pip_version} + - pip=={pip_version} - pip: - mlflow>=1.6 - pillow=={pillow_version} diff --git a/examples/multistep_workflow/als.py b/examples/multistep_workflow/als.py index 0bdcb781a668a..b6fbd83ca04f5 100644 --- a/examples/multistep_workflow/als.py +++ b/examples/multistep_workflow/als.py @@ -23,45 +23,46 @@ def train_als(ratings_data, split_prop, max_iter, reg_param, rank, cold_start_strategy): seed = 42 - spark = pyspark.sql.SparkSession.builder.getOrCreate() + with pyspark.sql.SparkSession.builder.getOrCreate() as spark: + ratings_df = spark.read.parquet(ratings_data) + (training_df, test_df) = ratings_df.randomSplit([split_prop, 1 - split_prop], seed=seed) + training_df.cache() + test_df.cache() - ratings_df = spark.read.parquet(ratings_data) - (training_df, test_df) = ratings_df.randomSplit([split_prop, 1 - split_prop], seed=seed) - training_df.cache() - test_df.cache() + mlflow.log_metric("training_nrows", training_df.count()) + mlflow.log_metric("test_nrows", test_df.count()) - mlflow.log_metric("training_nrows", training_df.count()) - mlflow.log_metric("test_nrows", test_df.count()) + print(f"Training: {training_df.count()}, test: {test_df.count()}") - print(f"Training: {training_df.count()}, test: {test_df.count()}") + als = ( + ALS() + .setUserCol("userId") + .setItemCol("movieId") + .setRatingCol("rating") + .setPredictionCol("predictions") + .setMaxIter(max_iter) + .setSeed(seed) + .setRegParam(reg_param) + .setColdStartStrategy(cold_start_strategy) + .setRank(rank) + ) - als = ( - ALS() - .setUserCol("userId") - .setItemCol("movieId") - .setRatingCol("rating") - .setPredictionCol("predictions") - .setMaxIter(max_iter) - .setSeed(seed) - .setRegParam(reg_param) - 
.setColdStartStrategy(cold_start_strategy) - .setRank(rank) - ) + als_model = Pipeline(stages=[als]).fit(training_df) - als_model = Pipeline(stages=[als]).fit(training_df) + reg_eval = RegressionEvaluator( + predictionCol="predictions", labelCol="rating", metricName="mse" + ) - reg_eval = RegressionEvaluator(predictionCol="predictions", labelCol="rating", metricName="mse") + predicted_test_dF = als_model.transform(test_df) - predicted_test_dF = als_model.transform(test_df) + test_mse = reg_eval.evaluate(predicted_test_dF) + train_mse = reg_eval.evaluate(als_model.transform(training_df)) - test_mse = reg_eval.evaluate(predicted_test_dF) - train_mse = reg_eval.evaluate(als_model.transform(training_df)) - - print(f"The model had a MSE on the test set of {test_mse}") - print(f"The model had a MSE on the (train) set of {train_mse}") - mlflow.log_metric("test_mse", test_mse) - mlflow.log_metric("train_mse", train_mse) - mlflow.spark.log_model(als_model, "als-model") + print(f"The model had a MSE on the test set of {test_mse}") + print(f"The model had a MSE on the (train) set of {train_mse}") + mlflow.log_metric("test_mse", test_mse) + mlflow.log_metric("train_mse", train_mse) + mlflow.spark.log_model(als_model, "als-model") if __name__ == "__main__": diff --git a/examples/multistep_workflow/etl_data.py b/examples/multistep_workflow/etl_data.py index 2dda4756831e7..eda48e459b009 100644 --- a/examples/multistep_workflow/etl_data.py +++ b/examples/multistep_workflow/etl_data.py @@ -22,20 +22,20 @@ def etl_data(ratings_csv, max_row_limit): with mlflow.start_run(): tmpdir = tempfile.mkdtemp() ratings_parquet_dir = os.path.join(tmpdir, "ratings-parquet") - spark = pyspark.sql.SparkSession.builder.getOrCreate() print(f"Converting ratings CSV {ratings_csv} to Parquet {ratings_parquet_dir}") - ratings_df = ( - spark.read.option("header", "true") - .option("inferSchema", "true") - .csv(ratings_csv) - .drop("timestamp") - ) # Drop unused column - ratings_df.show() - if max_row_limit != -1: - ratings_df = ratings_df.limit(max_row_limit) - ratings_df.write.parquet(ratings_parquet_dir) - print(f"Uploading Parquet ratings: {ratings_parquet_dir}") - mlflow.log_artifacts(ratings_parquet_dir, "ratings-parquet-dir") + with pyspark.sql.SparkSession.builder.getOrCreate() as spark: + ratings_df = ( + spark.read.option("header", "true") + .option("inferSchema", "true") + .csv(ratings_csv) + .drop("timestamp") + ) # Drop unused column + ratings_df.show() + if max_row_limit != -1: + ratings_df = ratings_df.limit(max_row_limit) + ratings_df.write.parquet(ratings_parquet_dir) + print(f"Uploading Parquet ratings: {ratings_parquet_dir}") + mlflow.log_artifacts(ratings_parquet_dir, "ratings-parquet-dir") if __name__ == "__main__": diff --git a/examples/multistep_workflow/train_keras.py b/examples/multistep_workflow/train_keras.py index d59fabc5975cd..7f2c091c509a3 100644 --- a/examples/multistep_workflow/train_keras.py +++ b/examples/multistep_workflow/train_keras.py @@ -29,85 +29,87 @@ def train_keras(ratings_data, als_model_uri, hidden_units): np.random.seed(0) tf.set_random_seed(42) # For reproducibility - spark = pyspark.sql.SparkSession.builder.getOrCreate() - als_model = mlflow.spark.load_model(als_model_uri).stages[0] - - ratings_df = spark.read.parquet(ratings_data) - - (training_df, test_df) = ratings_df.randomSplit([0.8, 0.2], seed=42) - training_df.cache() - test_df.cache() - - mlflow.log_metric("training_nrows", training_df.count()) - mlflow.log_metric("test_nrows", test_df.count()) - - print(f"Training: 
{training_df.count()}, test: {test_df.count()}") - - user_factors = als_model.userFactors.selectExpr("id as userId", "features as uFeatures") - item_factors = als_model.itemFactors.selectExpr("id as movieId", "features as iFeatures") - joined_train_df = training_df.join(item_factors, on="movieId").join(user_factors, on="userId") - joined_test_df = test_df.join(item_factors, on="movieId").join(user_factors, on="userId") - - # We'll combine the movies and ratings vectors into a single vector of length 24. - # We will then explode this features vector into a set of columns. - def concat_arrays(*args): - return list(chain(*args)) - - concat_arrays_udf = udf(concat_arrays, ArrayType(FloatType())) - - concat_train_df = joined_train_df.select( - "userId", - "movieId", - concat_arrays_udf(col("iFeatures"), col("uFeatures")).alias("features"), - col("rating").cast("float"), - ) - concat_test_df = joined_test_df.select( - "userId", - "movieId", - concat_arrays_udf(col("iFeatures"), col("uFeatures")).alias("features"), - col("rating").cast("float"), - ) - - pandas_df = concat_train_df.toPandas() - pandas_test_df = concat_test_df.toPandas() - - # This syntax will create a new DataFrame where elements of the 'features' vector - # are each in their own column. This is what we'll train our neural network on. - x_test = pd.DataFrame(pandas_test_df.features.values.tolist(), index=pandas_test_df.index) - x_train = pd.DataFrame(pandas_df.features.values.tolist(), index=pandas_df.index) - - # Show matrix for example. - print("Training matrix:") - print(x_train) - - # Create our Keras model with two fully connected hidden layers. - model = Sequential() - model.add(Dense(30, input_dim=24, activation="relu")) - model.add(Dense(hidden_units, activation="relu")) - model.add(Dense(1, activation="linear")) - - model.compile(loss="mse", optimizer=keras.optimizers.Adam(lr=0.0001)) - - early_stopping = EarlyStopping(monitor="val_loss", min_delta=0.0001, patience=2, mode="auto") - - model.fit( - x_train, - pandas_df["rating"], - validation_split=0.2, - verbose=2, - epochs=3, - batch_size=128, - shuffle=False, - callbacks=[early_stopping], - ) - - train_mse = model.evaluate(x_train, pandas_df["rating"], verbose=2) - test_mse = model.evaluate(x_test, pandas_test_df["rating"], verbose=2) - mlflow.log_metric("test_mse", test_mse) - mlflow.log_metric("train_mse", train_mse) - - print(f"The model had a MSE on the test set of {test_mse}") - mlflow.tensorflow.log_model(model, "keras-model") + with pyspark.sql.SparkSession.builder.getOrCreate() as spark: + als_model = mlflow.spark.load_model(als_model_uri).stages[0] + ratings_df = spark.read.parquet(ratings_data) + (training_df, test_df) = ratings_df.randomSplit([0.8, 0.2], seed=42) + training_df.cache() + test_df.cache() + + mlflow.log_metric("training_nrows", training_df.count()) + mlflow.log_metric("test_nrows", test_df.count()) + + print(f"Training: {training_df.count()}, test: {test_df.count()}") + + user_factors = als_model.userFactors.selectExpr("id as userId", "features as uFeatures") + item_factors = als_model.itemFactors.selectExpr("id as movieId", "features as iFeatures") + joined_train_df = training_df.join(item_factors, on="movieId").join( + user_factors, on="userId" + ) + joined_test_df = test_df.join(item_factors, on="movieId").join(user_factors, on="userId") + + # We'll combine the movies and ratings vectors into a single vector of length 24. + # We will then explode this features vector into a set of columns. 
+ def concat_arrays(*args): + return list(chain(*args)) + + concat_arrays_udf = udf(concat_arrays, ArrayType(FloatType())) + + concat_train_df = joined_train_df.select( + "userId", + "movieId", + concat_arrays_udf(col("iFeatures"), col("uFeatures")).alias("features"), + col("rating").cast("float"), + ) + concat_test_df = joined_test_df.select( + "userId", + "movieId", + concat_arrays_udf(col("iFeatures"), col("uFeatures")).alias("features"), + col("rating").cast("float"), + ) + + pandas_df = concat_train_df.toPandas() + pandas_test_df = concat_test_df.toPandas() + + # This syntax will create a new DataFrame where elements of the 'features' vector + # are each in their own column. This is what we'll train our neural network on. + x_test = pd.DataFrame(pandas_test_df.features.values.tolist(), index=pandas_test_df.index) + x_train = pd.DataFrame(pandas_df.features.values.tolist(), index=pandas_df.index) + + # Show matrix for example. + print("Training matrix:") + print(x_train) + + # Create our Keras model with two fully connected hidden layers. + model = Sequential() + model.add(Dense(30, input_dim=24, activation="relu")) + model.add(Dense(hidden_units, activation="relu")) + model.add(Dense(1, activation="linear")) + + model.compile(loss="mse", optimizer=keras.optimizers.Adam(lr=0.0001)) + + early_stopping = EarlyStopping( + monitor="val_loss", min_delta=0.0001, patience=2, mode="auto" + ) + + model.fit( + x_train, + pandas_df["rating"], + validation_split=0.2, + verbose=2, + epochs=3, + batch_size=128, + shuffle=False, + callbacks=[early_stopping], + ) + + train_mse = model.evaluate(x_train, pandas_df["rating"], verbose=2) + test_mse = model.evaluate(x_test, pandas_test_df["rating"], verbose=2) + mlflow.log_metric("test_mse", test_mse) + mlflow.log_metric("train_mse", train_mse) + + print(f"The model had a MSE on the test set of {test_mse}") + mlflow.tensorflow.log_model(model, "keras-model") if __name__ == "__main__": diff --git a/examples/openai/pyfunc.py b/examples/openai/chat_completions.py similarity index 77% rename from examples/openai/pyfunc.py rename to examples/openai/chat_completions.py index 3e65c5315fe7f..6482725bdc1cb 100644 --- a/examples/openai/pyfunc.py +++ b/examples/openai/chat_completions.py @@ -5,6 +5,8 @@ import pandas as pd import mlflow +from mlflow.models.signature import ModelSignature +from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema logging.getLogger("mlflow").setLevel(logging.ERROR) @@ -159,3 +161,40 @@ ] model = mlflow.pyfunc.load_model(model_info.model_uri) print(model.predict(list_of_strings)) + + +print( + """ +# ****************************************************************************** +# Inference parameters with chat completions +# ****************************************************************************** +""" +) +with mlflow.start_run(): + model_info = mlflow.openai.log_model( + model="gpt-3.5-turbo", + task=openai.ChatCompletion, + artifact_path="model", + messages=[{"role": "user", "content": "Tell me a joke about {animal}."}], + signature=ModelSignature( + inputs=Schema([ColSpec(type="string", name=None)]), + outputs=Schema([ColSpec(type="string", name=None)]), + params=ParamSchema( + [ + ParamSpec(name="temperature", default=0, dtype="float"), + ] + ), + ), + ) + + +model = mlflow.pyfunc.load_model(model_info.model_uri) +df = pd.DataFrame( + { + "animal": [ + "cats", + "dogs", + ] + } +) +print(model.predict(df, params={"temperature": 1})) diff --git a/examples/openai/completions.py b/examples/openai/completions.py 
index c0b5128bb40f9..5cf61bda40234 100644 --- a/examples/openai/completions.py +++ b/examples/openai/completions.py @@ -3,9 +3,18 @@ import openai import mlflow +from mlflow.models.signature import ModelSignature +from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema assert "OPENAI_API_KEY" in os.environ, " OPENAI_API_KEY environment variable must be set" +print( + """ +# ****************************************************************************** +# Completions indicating prompt template +# ****************************************************************************** +""" +) with mlflow.start_run(): model_info = mlflow.openai.log_model( @@ -17,3 +26,33 @@ model = mlflow.pyfunc.load_model(model_info.model_uri) print(model.predict(["I believe in a better world"])) + + +print( + """ +# ****************************************************************************** +# Completions using inference parameters +# ****************************************************************************** +""" +) +with mlflow.start_run(): + model_info = mlflow.openai.log_model( + model="text-davinci-002", + task=openai.Completion, + artifact_path="model", + prompt="Clasify the following tweet's sentiment: '{tweet}'.", + signature=ModelSignature( + inputs=Schema([ColSpec(type="string", name=None)]), + outputs=Schema([ColSpec(type="string", name=None)]), + params=ParamSchema( + [ + ParamSpec(name="max_tokens", default=16, dtype="long"), + ParamSpec(name="temperature", default=0, dtype="float"), + ParamSpec(name="best_of", default=1, dtype="long"), + ] + ), + ), + ) + +model = mlflow.pyfunc.load_model(model_info.model_uri) +print(model.predict(["I believe in a better world"], params={"temperature": 1, "best_of": 5})) diff --git a/examples/openai/embeddings.py b/examples/openai/embeddings.py index 63c3090392524..9020124a0d18c 100644 --- a/examples/openai/embeddings.py +++ b/examples/openai/embeddings.py @@ -1,12 +1,23 @@ import os +import numpy as np import openai import mlflow +from mlflow.models.signature import ModelSignature +from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema, TensorSpec assert "OPENAI_API_KEY" in os.environ, " OPENAI_API_KEY environment variable must be set" +print( + """ +# ****************************************************************************** +# Text embeddings +# ****************************************************************************** +""" +) + with mlflow.start_run(): model_info = mlflow.openai.log_model( model="text-embedding-ada-002", @@ -16,3 +27,27 @@ model = mlflow.pyfunc.load_model(model_info.model_uri) print(model.predict(["hello", "world"])) + + +print( + """ +# ****************************************************************************** +# Text embeddings with batch_size parameter +# ****************************************************************************** +""" +) + +with mlflow.start_run(): + mlflow.openai.log_model( + model="text-embedding-ada-002", + task=openai.Embedding, + artifact_path="model", + signature=ModelSignature( + inputs=Schema([ColSpec(type="string", name=None)]), + outputs=Schema([TensorSpec(type=np.dtype("float64"), shape=(-1,))]), + params=ParamSchema([ParamSpec(name="batch_size", dtype="long", default=1024)]), + ), + ) + +model = mlflow.pyfunc.load_model(model_info.model_uri) +print(model.predict(["hello", "world"], params={"batch_size": 16})) diff --git a/examples/sentence_transformers/simple.py b/examples/sentence_transformers/simple.py index 3993cba2f0628..f303362f83bbc 100644 --- 
a/examples/sentence_transformers/simple.py +++ b/examples/sentence_transformers/simple.py @@ -38,5 +38,5 @@ 2.37922110e-02 -2.28897743e-02 3.89375277e-02 3.02067865e-02] [ 4.81191138e-03 -9.33756605e-02 6.95968643e-02 8.09735525e-03 ... - 6.57437667e-02 -2.72239652e-02 4.02687863e-02 -1.05599344e-01]] + 6.57437667e-02 -2.72239652e-02 4.02687863e-02 -1.05599344e-01]] """ diff --git a/mlflow/R/mlflow/R/databricks-utils.R b/mlflow/R/mlflow/R/databricks-utils.R index aeb5a79a9a67c..5f1139ed5e27a 100644 --- a/mlflow/R/mlflow/R/databricks-utils.R +++ b/mlflow/R/mlflow/R/databricks-utils.R @@ -75,27 +75,48 @@ get_databricks_config_from_env <- function() { } get_databricks_config <- function(profile) { - config <- if (!is.na(profile)) { - get_databricks_config_for_profile(profile) - } else if (exists("spark.databricks.token") && exists("spark.databricks.api.url")) { + + # If a profile is provided, fetch its configuration + if (!is.na(profile)) { + config <- get_databricks_config_for_profile(profile) + if (databricks_config_is_valid(config)) { + return(config) + } + } + + # Check for environment variables + config <- get_databricks_config_from_env() + if (databricks_config_is_valid(config)) { + return(config) + } + + # Check 'DEFAULT' profile + config <- tryCatch({ + get_databricks_config_for_profile("DEFAULT") + }, error = function(e) { + # On error assume known invalid config + list(host = NA, token = NA, username = NA, password = NA) + }) + if (databricks_config_is_valid(config)) { + return(config) + } + + # When in Databricks (done last so other methods are explicit overrides) + if (exists("spark.databricks.token", envir = .GlobalEnv) && + exists("spark.databricks.api.url", envir = .GlobalEnv)) { config_vars <- list( host = get("spark.databricks.api.url", envir = .GlobalEnv), token = get("spark.databricks.token", envir = .GlobalEnv), insecure = Sys.getenv(config_variable_map$insecure, "False") ) - new_databricks_config(config_source = "db_dynamic", config_vars = config_vars) - } else { - config <- get_databricks_config_from_env() + config <- new_databricks_config(config_source = "db_dynamic", config_vars = config_vars) if (databricks_config_is_valid(config)) { - config - } else { - get_databricks_config_for_profile("DEFAULT") + return(config) } } - if (!databricks_config_is_valid(config)) { - stop("Could not find valid Databricks configuration.") - } - config + + # If no valid configuration is found by this point, raise an error + stop("Could not find valid Databricks configuration.") } #' Get information from Databricks Notebook environment diff --git a/mlflow/R/mlflow/R/tracking-client.R b/mlflow/R/mlflow/R/tracking-client.R index 56461a42961fd..4c0d8aa7157d3 100644 --- a/mlflow/R/mlflow/R/tracking-client.R +++ b/mlflow/R/mlflow/R/tracking-client.R @@ -28,7 +28,7 @@ new_mlflow_client_impl <- function(get_host_creds, get_cli_env = list, class = c ) } -new_mlflow_host_creds <- function( host = NA, username = NA, password = NA, token = NA, +new_mlflow_host_creds <- function(host = NA, username = NA, password = NA, token = NA, insecure = "False") { insecure_arg <- if (is.null(insecure) || is.na(insecure)) { "False" diff --git a/mlflow/__init__.py b/mlflow/__init__.py index 8f57ba5e66872..5b0c1689f5716 100644 --- a/mlflow/__init__.py +++ b/mlflow/__init__.py @@ -148,6 +148,7 @@ set_tags, start_run, ) +from mlflow.utils.async_logging.run_operations import RunOperations # noqa: F401 from mlflow.utils.credentials import login __all__ = [ diff --git a/mlflow/data/digest_utils.py 
b/mlflow/data/digest_utils.py index 692a8ad199a5b..bbaa02dc20041 100644 --- a/mlflow/data/digest_utils.py +++ b/mlflow/data/digest_utils.py @@ -1,10 +1,10 @@ -import hashlib from typing import Any, List from packaging.version import Version from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE +from mlflow.utils import insecure_hash MAX_ROWS = 10000 @@ -159,7 +159,7 @@ def get_normalized_md5_digest(elements: List[Any]) -> str: INVALID_PARAMETER_VALUE, ) - md5 = hashlib.md5() + md5 = insecure_hash.md5() for element in elements: md5.update(element) diff --git a/mlflow/data/pandas_dataset.py b/mlflow/data/pandas_dataset.py index d68afbbd6c54f..3c765168d169a 100644 --- a/mlflow/data/pandas_dataset.py +++ b/mlflow/data/pandas_dataset.py @@ -173,7 +173,7 @@ def from_pandas( ) -> PandasDataset: """ Constructs a :py:class:`PandasDataset ` instance from - a Pandas DataFrame, optional targets, and source. + a Pandas DataFrame, optional targets, optional predictions, and source. :param df: A Pandas DataFrame. :param source: The source from which the DataFrame was derived, e.g. a filesystem @@ -200,10 +200,10 @@ def from_pandas( import pandas as pd x = pd.DataFrame( - [["tom", 10, 1], ["nick", 15, 0], ["juli", 14, 1]], - columns=["Name", "Age", "Label"], + [["tom", 10, 1, 1], ["nick", 15, 0, 1], ["juli", 14, 1, 1]], + columns=["Name", "Age", "Label", "ModelOutput"], ) - dataset = mlflow.data.from_pandas(x, targets="Label") + dataset = mlflow.data.from_pandas(x, targets="Label", predictions="ModelOutput") """ from mlflow.data.code_dataset_source import CodeDatasetSource from mlflow.data.dataset_source_registry import resolve_dataset_source diff --git a/mlflow/langchain/__init__.py b/mlflow/langchain/__init__.py index e68817cabe87b..76c234d957a83 100644 --- a/mlflow/langchain/__init__.py +++ b/mlflow/langchain/__init__.py @@ -571,8 +571,14 @@ def _save_model(model, path, loader_fn, persist_dir): if model.tools: tools_data_path = os.path.join(path, _TOOLS_DATA_FILE_NAME) - with open(tools_data_path, "wb") as f: - cloudpickle.dump(model.tools, f) + try: + with open(tools_data_path, "wb") as f: + cloudpickle.dump(model.tools, f) + except Exception as e: + raise mlflow.MlflowException( + "Error when attempting to pickle the AgentExecutor tools. " + "This model likely does not support serialization." + ) from e model_data_kwargs[_TOOLS_DATA_KEY] = _TOOLS_DATA_FILE_NAME else: raise mlflow.MlflowException.invalid_parameter_value( diff --git a/mlflow/metrics/__init__.py b/mlflow/metrics/__init__.py index 12a167f6f0a3c..6791b02650040 100644 --- a/mlflow/metrics/__init__.py +++ b/mlflow/metrics/__init__.py @@ -6,9 +6,10 @@ make_genai_metric, ) from mlflow.metrics.genai.metric_definitions import ( - correctness, - relevance, - strict_correctness, + answer_correctness, + answer_relevance, + answer_similarity, + faithfulness, ) from mlflow.metrics.metric_definitions import ( _accuracy_eval_fn, @@ -19,7 +20,7 @@ _mape_eval_fn, _max_error_eval_fn, _mse_eval_fn, - _perplexity_eval_fn, + _precision_at_k_eval_fn, _precision_eval_fn, _r2_score_eval_fn, _recall_eval_fn, @@ -35,340 +36,395 @@ EvaluationMetric, make_metric, ) +from mlflow.utils.annotations import experimental -latency = make_metric( - eval_fn=lambda x: MetricValue(), - greater_is_better=False, - name="latency", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for calculating latency. 
Latency is determined by the time it takes to generate a -prediction for a given input. Note that computing latency requires each row to be predicted -sequentially, which will likely slow down the evaluation process. -""" - -# general text metrics -token_count = make_metric( - eval_fn=_token_count_eval_fn, - greater_is_better=True, - name="token_count", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for calculating token_count. Token count is calculated using tiktoken by using the -`cl100k_base` tokenizer. -""" - -toxicity = make_metric( - eval_fn=_toxicity_eval_fn, - greater_is_better=False, - name="toxicity", - long_name="toxicity/roberta-hate-speech-dynabench-r4", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for evaluating `toxicity`_ using the model `roberta-hate-speech-dynabench-r4`_, -which defines hate as "abusive speech targeting specific group characteristics, such as -ethnic origin, religion, gender, or sexual orientation." - -The score ranges from 0 to 1, where scores closer to 1 are more toxic. The default threshold -for a text to be considered "toxic" is 0.5. - -Aggregations calculated for this metric: - - ratio (of toxic input texts) - -.. _toxicity: https://huggingface.co/spaces/evaluate-measurement/toxicity -.. _roberta-hate-speech-dynabench-r4: https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target -""" -perplexity = make_metric( - eval_fn=_perplexity_eval_fn, - greater_is_better=False, - name="perplexity", - long_name="perplexity/gpt2", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for evaluating `perplexity`_ using the model gpt2. - -The score ranges from 0 to infinity, where a lower score means that the model is better at -predicting the given text and a higher score means that the model is not likely to predict the text. - -Aggregations calculated for this metric: - - mean - -.. _perplexity: https://huggingface.co/spaces/evaluate-metric/perplexity -""" +@experimental +def latency() -> EvaluationMetric: + """ + This function will create a metric for calculating latency. Latency is determined by the time + it takes to generate a prediction for a given input. Note that computing latency requires + each row to be predicted sequentially, which will likely slow down the evaluation process. + """ + return make_metric( + eval_fn=lambda x: MetricValue(), + greater_is_better=False, + name="latency", + ) -flesch_kincaid_grade_level = make_metric( - eval_fn=_flesch_kincaid_eval_fn, - greater_is_better=False, - name="flesch_kincaid_grade_level", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for calculating `flesch kincaid grade level`_ using `textstat`_. - -This metric outputs a number that approximates the grade level needed to comprehend the text, which -will likely range from around 0 to 15 (although it is not limited to this range). - -Aggregations calculated for this metric: - - mean - -.. _flesch kincaid grade level: - https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level -.. 
_textstat: https://pypi.org/project/textstat/ -""" - -ari_grade_level = make_metric( - eval_fn=_ari_eval_fn, - greater_is_better=False, - name="ari_grade_level", - long_name="automated_readability_index_grade_level", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. - -A metric for calculating `automated readability index`_ using `textstat`_. - -This metric outputs a number that approximates the grade level needed to comprehend the text, which -will likely range from around 0 to 15 (although it is not limited to this range). -Aggregations calculated for this metric: - - mean +# general text metrics +@experimental +def token_count() -> EvaluationMetric: + """ + This function will create a metric for calculating token_count. Token count is calculated + using tiktoken by using the `cl100k_base` tokenizer. + """ + return make_metric( + eval_fn=_token_count_eval_fn, + greater_is_better=True, + name="token_count", + ) + + +@experimental +def toxicity() -> EvaluationMetric: + """ + This function will create a metric for evaluating `toxicity`_ using the model + `roberta-hate-speech-dynabench-r4`_, which defines hate as "abusive speech targeting + specific group characteristics, such as ethnic origin, religion, gender, or sexual + orientation." + + The score ranges from 0 to 1, where scores closer to 1 are more toxic. The default threshold + for a text to be considered "toxic" is 0.5. + + Aggregations calculated for this metric: + - ratio (of toxic input texts) + + .. _toxicity: https://huggingface.co/spaces/evaluate-measurement/toxicity + .. _roberta-hate-speech-dynabench-r4: https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target + """ + return make_metric( + eval_fn=_toxicity_eval_fn, + greater_is_better=False, + name="toxicity", + long_name="toxicity/roberta-hate-speech-dynabench-r4", + version="v1", + ) + + +@experimental +def flesch_kincaid_grade_level() -> EvaluationMetric: + """ + This function will create a metric for calculating `flesch kincaid grade level`_ using + `textstat`_. + + This metric outputs a number that approximates the grade level needed to comprehend the text, + which will likely range from around 0 to 15 (although it is not limited to this range). + + Aggregations calculated for this metric: + - mean + + .. _flesch kincaid grade level: + https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level + .. _textstat: https://pypi.org/project/textstat/ + """ + return make_metric( + eval_fn=_flesch_kincaid_eval_fn, + greater_is_better=False, + name="flesch_kincaid_grade_level", + version="v1", + ) + + +@experimental +def ari_grade_level() -> EvaluationMetric: + """ + This function will create a metric for calculating `automated readability index`_ using + `textstat`_. + + This metric outputs a number that approximates the grade level needed to comprehend the text, + which will likely range from around 0 to 15 (although it is not limited to this range). + + Aggregations calculated for this metric: + - mean + + .. _automated readability index: https://en.wikipedia.org/wiki/Automated_readability_index + .. _textstat: https://pypi.org/project/textstat/ + """ + return make_metric( + eval_fn=_ari_eval_fn, + greater_is_better=False, + name="ari_grade_level", + long_name="automated_readability_index_grade_level", + version="v1", + ) -.. _automated readability index: https://en.wikipedia.org/wiki/Automated_readability_index -.. 
_textstat: https://pypi.org/project/textstat/ -""" # question answering metrics +@experimental +def exact_match() -> EvaluationMetric: + """ + This function will create a metric for calculating `accuracy`_ using sklearn. -exact_match = make_metric( - eval_fn=_accuracy_eval_fn, greater_is_better=True, name="exact_match", version="v1" -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. + This metric only computes an aggregate score which ranges from 0 to 1. -A metric for calculating `accuracy`_ using sklearn. + .. _accuracy: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html + """ + return make_metric( + eval_fn=_accuracy_eval_fn, greater_is_better=True, name="exact_match", version="v1" + ) -This metric only computes an aggregate score which ranges from 0 to 1. - -.. _accuracy: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html -""" # text summarization metrics +@experimental +def rouge1() -> EvaluationMetric: + """ + This function will create a metric for evaluating `rouge1`_. + + The score ranges from 0 to 1, where a higher score indicates higher similarity. + `rouge1`_ uses unigram based scoring to calculate similarity. + + Aggregations calculated for this metric: + - mean + + .. _rouge1: https://huggingface.co/spaces/evaluate-metric/rouge + """ + return make_metric( + eval_fn=_rouge1_eval_fn, + greater_is_better=True, + name="rouge1", + version="v1", + ) + + +@experimental +def rouge2() -> EvaluationMetric: + """ + This function will create a metric for evaluating `rouge2`_. + + The score ranges from 0 to 1, where a higher score indicates higher similarity. + `rouge2`_ uses bigram based scoring to calculate similarity. + + Aggregations calculated for this metric: + - mean + + .. _rouge2: https://huggingface.co/spaces/evaluate-metric/rouge + """ + return make_metric( + eval_fn=_rouge2_eval_fn, + greater_is_better=True, + name="rouge2", + version="v1", + ) + + +@experimental +def rougeL() -> EvaluationMetric: + """ + This function will create a metric for evaluating `rougeL`_. + + The score ranges from 0 to 1, where a higher score indicates higher similarity. + `rougeL`_ uses unigram based scoring to calculate similarity. + + Aggregations calculated for this metric: + - mean + + .. _rougeL: https://huggingface.co/spaces/evaluate-metric/rouge + """ + return make_metric( + eval_fn=_rougeL_eval_fn, + greater_is_better=True, + name="rougeL", + version="v1", + ) + + +@experimental +def rougeLsum() -> EvaluationMetric: + """ + This function will create a metric for evaluating `rougeLsum`_. + + The score ranges from 0 to 1, where a higher score indicates higher similarity. + `rougeLsum`_ uses longest common subsequence based scoring to calculate similarity. + + Aggregations calculated for this metric: + - mean + + .. _rougeLsum: https://huggingface.co/spaces/evaluate-metric/rouge + """ + return make_metric( + eval_fn=_rougeLsum_eval_fn, + greater_is_better=True, + name="rougeLsum", + version="v1", + ) + + +@experimental +def precision_at_k(k) -> EvaluationMetric: + """ + This function will create a metric for calculating ``precision_at_k`` for retriever models. + + It is recommended to use a static dataset (Pandas Dataframe or MLflow Pandas Dataset) + containing columns for: input queries, retrieved relevant doc IDs, and ground-truth doc IDs. A + "doc ID" is a string that uniquely identifies a document. All doc IDs should be entered as a + tuple of doc ID strings. 
+ + The ``targets`` parameter should specify the column name of the ground-truth relevant doc IDs. + + If you choose to use a static dataset, the ``predictions`` parameter should specify the column + name of the retrieved relevant doc IDs. Alternatively, if you choose to specify a function for + the ``model`` parameter, the function should take a Pandas DataFrame as input and return a + Pandas DataFrame with a column of retrieved relevant doc IDs, specified by the ``predictions`` + parameter. + + ``k`` should be a positive integer specifying the number of retrieved doc IDs to consider for + each input query. ``k`` defaults to 3. -rouge1 = make_metric( - eval_fn=_rouge1_eval_fn, - greater_is_better=True, - name="rouge1", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. + This metric computes a score between 0 and 1 for each row representing the precision of the + retriever model at the given ``k`` value. If no relevant documents are retrieved, the score is + 0, indicating that no relevant docs were retrieved. Let ``x = min(k, # of retrieved doc IDs)``. + Then, the precision at k is calculated as follows: + + ``precision_at_k`` = (# of relevant retrieved doc IDs in top-``x`` ranked docs) / ``x``. + + This metric is a builtin metric for the ``'retriever'`` model type, meaning it will be + automatically calculated with a default ``k`` value of 3. To use another ``k`` value, you have + two options with the :py:func:`mlflow.evaluate` API: + + 1. ``evaluator_config={"k": 5}`` + 2. ``extra_metrics = [mlflow.metrics.precision_at_k(k=5)]`` -A metric for evaluating `rouge1`_. - -The score ranges from 0 to 1, where a higher score indicates higher similarity. -`rouge1`_ uses unigram based scoring to calculate similarity. + Note that the ``k`` value in the ``evaluator_config`` will be ignored in this case. It is + recommended to remove the ``model_type`` as well, or else precision@3 and precision@5 will + both be calculated. + """ + return make_metric( + eval_fn=_precision_at_k_eval_fn(k), + greater_is_better=True, + name="precision_at_k", + version="v1", + ) -Aggregations calculated for this metric: - - mean -.. _rouge1: https://huggingface.co/spaces/evaluate-metric/rouge -""" +# General Regression Metrics +def mae() -> EvaluationMetric: + """ + This function will create a metric for evaluating `mae`_. -rouge2 = make_metric( - eval_fn=_rouge2_eval_fn, - greater_is_better=True, - name="rouge2", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. + This metric computes an aggregate score for the mean absolute error for regression. -A metric for evaluating `rouge2`_. - -The score ranges from 0 to 1, where a higher score indicates higher similarity. -`rouge2`_ uses bigram based scoring to calculate similarity. + .. _mae: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html + """ + return make_metric( + eval_fn=_mae_eval_fn, + greater_is_better=False, + name="mean_absolute_error", + ) -Aggregations calculated for this metric: - - mean -.. _rouge2: https://huggingface.co/spaces/evaluate-metric/rouge -""" +def mse() -> EvaluationMetric: + """ + This function will create a metric for evaluating `mse`_. -rougeL = make_metric( - eval_fn=_rougeL_eval_fn, - greater_is_better=True, - name="rougeL", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. 
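To illustrate the two options described in the precision_at_k docstring above, here is a minimal sketch against a static dataset. The column names are placeholders, and exact mlflow.evaluate argument support may vary by MLflow version:

    import mlflow
    import pandas as pd

    # Illustrative static dataset: each doc-ID cell is a tuple of doc ID strings.
    eval_df = pd.DataFrame(
        {
            "questions": ["what is mlflow?"],
            "retrieved_docs": [("doc_1", "doc_3", "doc_5")],
            "ground_truth_docs": [("doc_1", "doc_2")],
        }
    )

    mlflow.evaluate(
        data=eval_df,
        predictions="retrieved_docs",
        targets="ground_truth_docs",
        model_type="retriever",  # builtin precision_at_k is computed with the default k=3
        evaluator_config={"k": 5},  # option 1: override k for the builtin metric
        # option 2: omit model_type and request the metric explicitly instead:
        # extra_metrics=[mlflow.metrics.precision_at_k(k=5)],
    )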
+ This metric computes an aggregate score for the mean squared error for regression. -A metric for evaluating `rougeL`_. - -The score ranges from 0 to 1, where a higher score indicates higher similarity. -`rougeL`_ uses unigram based scoring to calculate similarity. + .. _mse: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html + """ + return make_metric( + eval_fn=_mse_eval_fn, + greater_is_better=False, + name="mean_squared_error", + ) -Aggregations calculated for this metric: - - mean -.. _rougeL: https://huggingface.co/spaces/evaluate-metric/rouge -""" +def rmse() -> EvaluationMetric: + """ + This function will create a metric for evaluating the square root of `mse`_. -rougeLsum = make_metric( - eval_fn=_rougeLsum_eval_fn, - greater_is_better=True, - name="rougeLsum", - version="v1", -) -""" -.. Note:: Experimental: This metric may change or be removed in a future release without warning. + This metric computes an aggregate score for the root mean absolute error for regression. -A metric for evaluating `rougeLsum`_. - -The score ranges from 0 to 1, where a higher score indicates higher similarity. -`rougeLsum`_ uses longest common subsequence based scoring to calculate similarity. + .. _mse: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html + """ -Aggregations calculated for this metric: - - mean + return make_metric( + eval_fn=_rmse_eval_fn, + greater_is_better=False, + name="root_mean_squared_error", + ) -.. _rougeLsum: https://huggingface.co/spaces/evaluate-metric/rouge -""" -# General Regression Metrics +def r2_score() -> EvaluationMetric: + """ + This function will create a metric for evaluating `r2_score`_. -mae = make_metric( - eval_fn=_mae_eval_fn, - greater_is_better=False, - name="mean_absolute_error", -) -""" -A metric for evaluating `mae`_. + This metric computes an aggregate score for the coefficient of determination. R2 ranges from + negative infinity to 1, and measures the percentage of variance explained by the predictor + variables in a regression. -This metric computes an aggregate score for the mean absolute error for regression. + .. _r2_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html + """ + return make_metric( + eval_fn=_r2_score_eval_fn, + greater_is_better=True, + name="r2_score", + ) -.. _mae: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html -""" -mse = make_metric( - eval_fn=_mse_eval_fn, - greater_is_better=False, - name="mean_squared_error", -) -""" -A metric for evaluating `mse`_. +def max_error() -> EvaluationMetric: + """ + This function will create a metric for evaluating `max_error`_. -This metric computes an aggregate score for the mean squared error for regression. + This metric computes an aggregate score for the maximum residual error for regression. -.. _mse: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html -""" + .. _max_error: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.max_error.html + """ + return make_metric( + eval_fn=_max_error_eval_fn, + greater_is_better=False, + name="max_error", + ) -rmse = make_metric( - eval_fn=_rmse_eval_fn, - greater_is_better=False, - name="root_mean_squared_error", -) -""" -A metric for evaluating the square root of `mse`_. -This metric computes an aggregate score for the root mean absolute error for regression. 
+def mape() -> EvaluationMetric: + """ + This function will create a metric for evaluating `mape`_. -.. _mse: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html -""" + This metric computes an aggregate score for the mean absolute percentage error for regression. -r2_score = make_metric( - eval_fn=_r2_score_eval_fn, - greater_is_better=True, - name="r2_score", -) -""" -A metric for evaluating `r2_score`_. - -This metric computes an aggregate score for the coefficient of determination. R2 ranges from -negative infinity to 1, and measures the percentage of variance explained by the predictor -variables in a regression. - -.. _r2_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html -""" - -max_error = make_metric( - eval_fn=_max_error_eval_fn, - greater_is_better=False, - name="max_error", -) -""" -A metric for evaluating `max_error`_. + .. _mape: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html + """ + return make_metric( + eval_fn=_mape_eval_fn, + greater_is_better=False, + name="mean_absolute_percentage_error", + ) -This metric computes an aggregate score for the maximum residual error for regression. -.. _max_error: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.max_error.html -""" +# Binary Classification Metrics -mape = make_metric( - eval_fn=_mape_eval_fn, - greater_is_better=False, - name="mean_absolute_percentage_error", -) -""" -A metric for evaluating `mape`_. -This metric computes an aggregate score for the mean absolute percentage error for regression. +def recall_score() -> EvaluationMetric: + """ + This function will create a metric for evaluating `recall`_ for classification. -.. _mape: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html -""" + This metric computes an aggregate score between 0 and 1 for the recall of a classification task. -# Binary Classification Metrics + .. _recall: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html + """ + return make_metric(eval_fn=_recall_eval_fn, greater_is_better=True, name="recall_score") -recall_score = make_metric(eval_fn=_recall_eval_fn, greater_is_better=True, name="recall_score") -""" -A metric for evaluating `recall`_ for classification. -This metric computes an aggregate score between 0 and 1 for the recall of a classification task. +def precision_score() -> EvaluationMetric: + """ + This function will create a metric for evaluating `precision`_ for classification. -.. _recall: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html -""" + This metric computes an aggregate score between 0 and 1 for the precision of + classification task. -precision_score = make_metric( - eval_fn=_precision_eval_fn, greater_is_better=True, name="precision_score" -) -""" -A metric for evaluating `precision`_ for classification. + .. _precision: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html + """ + return make_metric(eval_fn=_precision_eval_fn, greater_is_better=True, name="precision_score") -This metric computes an aggregate score between 0 and 1 for the precision of -classification task. -.. _precision: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html -""" +def f1_score() -> EvaluationMetric: + """ + This function will create a metric for evaluating `f1_score`_ for binary classification. 
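Because the regression and classification metrics above are now factory functions returning EvaluationMetric objects (rather than module-level constants), they must be called before being passed to mlflow.evaluate. A minimal sketch against a static dataset with the predictions column already materialized; argument support may differ slightly across MLflow versions:

    import mlflow
    import pandas as pd
    from mlflow.metrics import mae, r2_score, rmse

    # Tiny illustrative dataset with model outputs already materialized, so no model is needed.
    eval_df = pd.DataFrame({"outputs": [2.5, 0.0, 2.1], "label": [3.0, -0.5, 2.0]})

    results = mlflow.evaluate(
        data=eval_df,
        predictions="outputs",
        targets="label",
        extra_metrics=[mae(), rmse(), r2_score()],  # note the calls: each returns an EvaluationMetric
    )
    print(results.metrics)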
-f1_score = make_metric(eval_fn=_f1_score_eval_fn, greater_is_better=True, name="f1_score") -""" -A metric for evaluating `f1_score`_ for binary classification. + This metric computes an aggregate score between 0 and 1 for the F1 score (F-measure) of a + classification task. F1 score is defined as 2 * (precision * recall) / (precision + recall). -This metric computes an aggregate score between 0 and 1 for the F1 score (F-measure) of a -classification task. F1 score is defined as 2 * (precision * recall) / (precision + recall). + .. _f1_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html + """ + return make_metric(eval_fn=_f1_score_eval_fn, greater_is_better=True, name="f1_score") -.. _f1_score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html -""" __all__ = [ "EvaluationExample", "EvaluationMetric", "MetricValue", "make_metric", - "perplexity", "flesch_kincaid_grade_level", "ari_grade_level", "accuracy", @@ -387,9 +443,10 @@ "binary_recall", "binary_precision", "binary_f1_score", - "correctness", - "relevance", - "strict_correctness", + "answer_similarity", + "faithfulness", + "answer_correctness", + "answer_relevance", "token_count", "latency", ] diff --git a/mlflow/metrics/base.py b/mlflow/metrics/base.py index fd1293f7c327e..0afd104aff6a7 100644 --- a/mlflow/metrics/base.py +++ b/mlflow/metrics/base.py @@ -16,7 +16,7 @@ class MetricValue: """ scores: List[float] = None - justifications: List[float] = None + justifications: List[str] = None aggregate_results: Dict[str, float] = None @@ -75,29 +75,30 @@ class EvaluationExample: input: str output: str score: float - justification: str = None + justification: str grading_context: Dict[str, str] = None + def _format_grading_context(self): + return "\n".join( + [f"key: {key}\nvalue:\n{value}" for key, value in self.grading_context.items()] + ) + def __str__(self) -> str: grading_context = ( "" if self.grading_context is None - else "\n".join( - [f"Provided {key}: {value}" for key, value in self.grading_context.items()] - ) + else "Additional information used by the model:\n" f"{self._format_grading_context()}" ) - justification = "" - if self.justification is not None: - justification = f"Justification: {self.justification}\n" - return f""" -Input: {self.input} +Input: +{self.input} -Provided output: {self.output} +Output: +{self.output} {grading_context} -Score: {self.score} -{justification} +score: {self.score} +justification: {self.justification} """ diff --git a/mlflow/metrics/genai/genai_metric.py b/mlflow/metrics/genai/genai_metric.py index 151ccc7a530cb..879d4addaefa7 100644 --- a/mlflow/metrics/genai/genai_metric.py +++ b/mlflow/metrics/genai/genai_metric.py @@ -3,14 +3,20 @@ import re from concurrent.futures import ThreadPoolExecutor, as_completed from inspect import Parameter, Signature -from typing import TYPE_CHECKING, Any, Dict, List, Optional +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union from mlflow.exceptions import MlflowException from mlflow.metrics.base import EvaluationExample, MetricValue from mlflow.metrics.genai import model_utils -from mlflow.metrics.genai.utils import _get_latest_metric_version +from mlflow.metrics.genai.utils import _get_default_model, _get_latest_metric_version from mlflow.models import EvaluationMetric, make_metric -from mlflow.protos.databricks_pb2 import INTERNAL_ERROR, INVALID_PARAMETER_VALUE +from mlflow.protos.databricks_pb2 import ( + BAD_REQUEST, + INTERNAL_ERROR, + INVALID_PARAMETER_VALUE, + 
UNAUTHENTICATED, + ErrorCode, +) from mlflow.utils.annotations import experimental from mlflow.utils.class_utils import _get_class_from_string @@ -33,7 +39,12 @@ def _format_args_string(grading_context_columns: Optional[List[str]], eval_value return ( "" if args_dict is None - else "\n".join(f"Provided {arg}: {arg_value}" for arg, arg_value in args_dict.items()) + else ( + "Additional information used by the model:\n" + + "\n".join( + [f"key: {arg}\nvalue:\n{arg_value}" for arg, arg_value in args_dict.items()] + ) + ) ) @@ -51,20 +62,20 @@ def _extract_score_and_justification(output): # Attempt to parse JSON try: data = json.loads(text) - score = int(data.get("Score")) - justification = data.get("Justification") + score = int(data.get("score")) + justification = data.get("justification") except json.JSONDecodeError: # If parsing fails, use regex - match = re.search(r"Score: (\d+),?\s*Justification: (.+)", text) + match = re.search(r"score: (\d+),?\s*justification: (.+)", text) if match: score = int(match.group(1)) justification = match.group(2) else: score = None - justification = None + justification = f"Failed to extract score and justification. Raw output: {output}" if not isinstance(score, (int, float)) or not isinstance(justification, str): - return None, None + return None, f"Failed to extract score and justification. Raw output: {output}" return score, justification @@ -78,13 +89,13 @@ def make_genai_metric( grading_prompt: str, examples: Optional[List[EvaluationExample]] = None, version: Optional[str] = _get_latest_metric_version(), - model: Optional[str] = "openai:/gpt-3.5-turbo-16k", - grading_context_columns: Optional[List[str]] = None, + model: Optional[str] = _get_default_model(), + grading_context_columns: Optional[Union[str, List[str]]] = [], # noqa: B006 parameters: Optional[Dict[str, Any]] = None, - aggregations: Optional[List[str]] = None, + aggregations: Optional[List[str]] = ["mean", "variance", "p90"], # noqa: B006 greater_is_better: bool = True, max_workers: int = 10, - judge_request_timeout: int = 15, + judge_request_timeout: int = 60, ) -> EvaluationMetric: """ Create a genai metric used to evaluate LLM using LLM as a judge in MLflow. @@ -94,19 +105,25 @@ def make_genai_metric( :param grading_prompt: Grading criteria of the metric. :param examples: (Optional) Examples of the metric. :param version: (Optional) Version of the metric. Currently supported versions are: v1. - :param model: (Optional) Model uri of the metric. - :param grading_context_columns: (Optional) grading_context_columns required to compute - the metric. These grading_context_columns are used by the LLM as a judge as additional - information to compute the metric. The columns are extracted from the input dataset or - output predictions based on col_mapping in evaluator_config. - :param parameters: (Optional) Parameters for the llm used to compute the metric. + :param model: (Optional) Model uri of the of an openai or gateway judge model in the format of + "openai:/gpt-4" or "gateway:/my-route". Defaults to + "openai:/gpt-4". Your use of a third party LLM service (e.g., OpenAI) for + evaluation may be subject to and governed by the LLM service's terms of use. + :param grading_context_columns: (Optional) The name of the grading context column, or a list of + grading context column names, required to compute the metric. The + ``grading_context_columns`` are used by the LLM as a judge as additional information to + compute the metric. 
The columns are extracted from the input dataset or output predictions + based on ``col_mapping`` in the ``evaluator_config`` passed to :py:func:`mlflow.evaluate()`. + :param parameters: (Optional) Parameters for the LLM used to compute the metric. By default, we + set the temperature to 0.0, max_tokens to 200, and top_p to 1.0. We recommend + setting the temperature to 0.0 for the LLM used as a judge to ensure consistent results. :param aggregations: (Optional) The list of options to aggregate the scores. Currently supported options are: min, max, mean, median, variance, p90. :param greater_is_better: (Optional) Whether the metric is better when it is greater. :param max_workers: (Optional) The maximum number of workers to use for judge scoring. Defaults to 10 workers. :param judge_request_timeout: (Optional) The timeout in seconds for each judge scoring request. - Defaults to 15 seconds. + Defaults to 60 seconds. :return: A metric object. @@ -128,7 +145,7 @@ def make_genai_metric( "its purpose, and its developer. It could be more concise for a 5-score.", ), grading_context={ - "ground_truth": ( + "targets": ( "MLflow is an open-source platform for managing " "the end-to-end machine learning (ML) lifecycle. It was developed by " "Databricks, a company that specializes in big data and machine learning " @@ -140,38 +157,63 @@ def make_genai_metric( ) metric = make_genai_metric( - name="correctness", + name="answer_correctness", definition=( - "Correctness refers to how well the generated output matches " - "or aligns with the reference or ground truth text that is considered " - "accurate and appropriate for the given input. The ground truth serves as " - "a benchmark against which the provided output is compared to determine the " - "level of accuracy and fidelity." + "Answer correctness is evaluated on the accuracy of the provided output based on " + "the provided targets, which is the ground truth. Scores can be assigned based on " + "the degree of semantic similarity and factual correctness of the provided output " + "to the provided targets, where a higher score indicates higher degree of accuracy." ), grading_prompt=( - "Correctness: If the answer correctly answer the question, below " - "are the details for different scores: " - "- Score 0: the answer is completely incorrect, doesn’t mention anything about " - "the question or is completely contrary to the correct answer. " - "- Score 1: the answer provides some relevance to the question and answer " - "one aspect of the question correctly. " - "- Score 2: the answer mostly answer the question but is missing or hallucinating " - "on one critical aspect. " - "- Score 4: the answer correctly answer the question and not missing any " - "major aspect" + "Answer correctness: Below are the details for different scores:" + "- Score 1: The output is completely incorrect. It is completely different from " + "or contradicts the provided targets." + "- Score 2: The output demonstrates some degree of semantic similarity and " + "includes partially correct information. However, the output still has significant " + "discrepancies with the provided targets or inaccuracies." + "- Score 3: The output addresses a couple of aspects of the input accurately, " + "aligning with the provided targets. However, there are still omissions or minor " + "inaccuracies." + "- Score 4: The output is mostly correct. It provides mostly accurate information, " + "but there may be one or more minor omissions or inaccuracies." + "- Score 5: The output is correct. 
It demonstrates a high degree of accuracy and " + "semantic similarity to the targets." ), examples=[example], version="v1", - model="gateway:/gpt4", - grading_context_columns=["ground_truth"], - parameters={"temperature": 1.0}, + model="openai:/gpt-4", + grading_context_columns=["targets"], + parameters={"temperature": 0.0}, aggregations=["mean", "variance", "p90"], greater_is_better=True, ) """ - - if aggregations is None: - aggregations = ["mean", "variance", "p90"] + if not isinstance(grading_context_columns, list): + grading_context_columns = [grading_context_columns] + + class_name = f"mlflow.metrics.genai.prompts.{version}.EvaluationModel" + try: + evaluation_model_class_module = _get_class_from_string(class_name) + except ModuleNotFoundError: + raise MlflowException( + f"Failed to find evaluation model for version {version}." + f"Please check the correctness of the version", + error_code=INVALID_PARAMETER_VALUE, + ) from None + except Exception as e: + raise MlflowException( + f"Failed to construct evaluation model {version}. Error: {e!r}", + error_code=INTERNAL_ERROR, + ) from None + + evaluation_context = evaluation_model_class_module( + name, + definition, + grading_prompt, + examples, + model, + *(parameters,) if parameters is not None else (), + ).to_dict() def eval_fn( predictions: "pd.Series", @@ -182,31 +224,7 @@ def eval_fn( """ This is the function that is called when the metric is evaluated. """ - eval_values = dict(zip(grading_context_columns, args)) - class_name = f"mlflow.metrics.genai.prompts.{version}.EvaluationModel" - try: - evaluation_model_class_module = _get_class_from_string(class_name) - except ModuleNotFoundError: - raise MlflowException( - f"Failed to find evaluation model for version {version}." - f"Please check the correctness of the version", - error_code=INVALID_PARAMETER_VALUE, - ) from None - except Exception as e: - raise MlflowException( - f"Failed to construct evaluation model {version}. Error: {e!r}", - error_code=INTERNAL_ERROR, - ) from None - - evaluation_context = evaluation_model_class_module( - name, - definition, - grading_prompt, - examples, - model, - *(parameters,) if parameters is not None else (), - ).to_dict() outputs = predictions.to_list() inputs = inputs.to_list() @@ -233,7 +251,21 @@ def score_model_on_one_payload( eval_parameters, eval_model, ): - arg_string = _format_args_string(grading_context_columns, eval_values, indx) + try: + arg_string = _format_args_string(grading_context_columns, eval_values, indx) + except Exception as e: + raise MlflowException( + f"Values for grading_context_columns are malformed and cannot be " + f"formatted into a prompt for metric '{name}'.\n" + f"Required columns: {grading_context_columns}\n" + f"Values: {eval_values}\n" + f"Error: {e!r}\n" + f"Please check the following: \n" + "- predictions and targets (if required) are provided correctly\n" + "- grading_context_columns are mapped correctly using the evaluator_config " + "parameter\n" + "- input and output data are formatted correctly." + ) payload = { "prompt": evaluation_context["eval_prompt"].format( input=input, output=output, grading_context_columns=arg_string @@ -241,11 +273,18 @@ def score_model_on_one_payload( **eval_parameters, } try: - raw_result = model_utils.score_model_on_payload(eval_model, payload) + raw_result = model_utils.score_model_on_payload( + eval_model, payload, judge_request_timeout + ) return _extract_score_and_justification(raw_result) except Exception as e: - _logger.info(f"Failed to score model on payload. 
Error: {e!r}") - return None, None + if isinstance(e, MlflowException): + if e.error_code in [ + ErrorCode.Name(BAD_REQUEST), + ErrorCode.Name(UNAUTHENTICATED), + ]: + raise MlflowException(e) + return None, f"Failed to score model on payload. Error: {e!s}" scores = [None] * len(inputs) justifications = [None] * len(inputs) @@ -316,5 +355,9 @@ def aggregate_function(aggregate_option, scores): eval_fn.__signature__ = Signature(signature_parameters) return make_metric( - eval_fn=eval_fn, greater_is_better=greater_is_better, name=name, version=version + eval_fn=eval_fn, + greater_is_better=greater_is_better, + name=name, + version=version, + metric_details=evaluation_context["eval_prompt"].__str__(), ) diff --git a/mlflow/metrics/genai/metric_definitions.py b/mlflow/metrics/genai/metric_definitions.py index 61e013fda381b..1994853942d0e 100644 --- a/mlflow/metrics/genai/metric_definitions.py +++ b/mlflow/metrics/genai/metric_definitions.py @@ -11,187 +11,260 @@ @experimental -def correctness( +def answer_similarity( model: Optional[str] = None, metric_version: Optional[str] = None, examples: Optional[List[EvaluationExample]] = None, + judge_request_timeout=60, ) -> EvaluationMetric: """ - This function will create a genai metric used to evaluate the correctness of an LLM using the - model provided. Correctness will be assessed by the similarity in meaning and description to - the ``ground_truth``. + This function will create a genai metric used to evaluate the answer similarity of an LLM + using the model provided. Answer similarity will be assessed by the semantic similarity of the + output to the ``ground_truth``, which should be specified in the ``targets`` column. - The ``ground_truth`` eval_arg must be provided as part of the input dataset or output - predictions. This can be mapped to a column of a different name using the a ``col_mapping`` - in the ``evaluator_config``. + The ``targets`` eval_arg must be provided as part of the input dataset or output + predictions. This can be mapped to a column of a different name using ``col_mapping`` + in the ``evaluator_config`` parameter, or using the ``targets`` parameter in mlflow.evaluate(). An MlflowException will be raised if the specified version for this metric does not exist. - :param model: (Optional) The model that will be used to evaluate this metric. Defaults to GPT-4. - :param metric_version: (Optional) The version of the correctness metric to use. + :param model: (Optional) The model that will be used to evaluate this metric. Defaults to + gpt-4. Your use of a third party LLM service (e.g., OpenAI) for evaluation may + be subject to and governed by the LLM service's terms of use. + :param metric_version: (Optional) The version of the answer similarity metric to use. Defaults to the latest version. :param examples: (Optional) Provide a list of examples to help the judge model evaluate the - correctness. It is highly recommended to add examples to be used as a reference to + answer similarity. It is highly recommended to add examples to be used as a reference to evaluate the new results. + :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request. + Defaults to 60 seconds. 
:return: A metric object """ if metric_version is None: metric_version = _get_latest_metric_version() - class_name = f"mlflow.metrics.genai.prompts.{metric_version}.CorrectnessMetric" + class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerSimilarityMetric" try: - correctness_class_module = _get_class_from_string(class_name) + answer_similarity_class_module = _get_class_from_string(class_name) except ModuleNotFoundError: raise MlflowException( - f"Failed to find correctness metric for version {metric_version}." + f"Failed to find answer similarity metric for version {metric_version}." f" Please check the version", error_code=INVALID_PARAMETER_VALUE, ) from None except Exception as e: raise MlflowException( - f"Failed to construct correctness metric {metric_version}. Error: {e!r}", + f"Failed to construct answer similarity metric {metric_version}. Error: {e!r}", error_code=INTERNAL_ERROR, ) from None if examples is None: - examples = correctness_class_module.default_examples + examples = answer_similarity_class_module.default_examples if model is None: - model = correctness_class_module.default_model + model = answer_similarity_class_module.default_model return make_genai_metric( - name="correctness", - definition=correctness_class_module.definition, - grading_prompt=correctness_class_module.grading_prompt, + name="answer_similarity", + definition=answer_similarity_class_module.definition, + grading_prompt=answer_similarity_class_module.grading_prompt, examples=examples, version=metric_version, model=model, - grading_context_columns=correctness_class_module.grading_context_columns, - parameters=correctness_class_module.parameters, + grading_context_columns=answer_similarity_class_module.grading_context_columns, + parameters=answer_similarity_class_module.parameters, aggregations=["mean", "variance", "p90"], greater_is_better=True, + judge_request_timeout=judge_request_timeout, ) @experimental -def strict_correctness( +def answer_correctness( model: Optional[str] = None, metric_version: Optional[str] = None, examples: Optional[List[EvaluationExample]] = None, + judge_request_timeout=60, ) -> EvaluationMetric: """ - This function will create a genai metric used to evaluate the strict correctness of an LLM - using the model provided. Strict correctness should be used in cases where correctness is - binary, and the source of truth is provided in the ``ground_truth``. Outputs will be - given either the highest or lowest score depending on if they are consistent with the - ``ground_truth``. When dealing with inputs that may have multiple correct outputs, varying - degrees of correctness, or when considering other factors such as the comprehensiveness of - the output, it is more appropriate to use the correctness metric instead. - - The ``ground_truth`` eval_arg must be provided as part of the input dataset or output - predictions. This can be mapped to a column of a different name using the a ``col_mapping`` - in the ``evaluator_config``. + This function will create a genai metric used to evaluate the answer correctness of an LLM + using the model provided. Answer correctness will be assessed by the accuracy of the provided + output based on the ``ground_truth``, which should be specified in the ``targets`` column. + + The ``targets`` eval_arg must be provided as part of the input dataset or output + predictions. This can be mapped to a column of a different name using ``col_mapping`` + in the ``evaluator_config`` parameter, or using the ``targets`` parameter in mlflow.evaluate(). 
An MlflowException will be raised if the specified version for this metric does not exist. - :param model: (Optional) The model that will be used to evaluate this metric. Defaults to GPT-4. - :param metric_version: (Optional) The version of the strict correctness metric to use. + :param model: (Optional) The model that will be used to evaluate this metric. Defaults to + gpt-4. Your use of a third party LLM service (e.g., OpenAI) for evaluation may + be subject to and governed by the LLM service's terms of use. + :param metric_version: (Optional) The version of the answer correctness metric to use. Defaults to the latest version. :param examples: (Optional) Provide a list of examples to help the judge model evaluate the - strict correctness. It is highly recommended to add examples to be used as a reference to + answer correctness. It is highly recommended to add examples to be used as a reference to evaluate the new results. + :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request. + Defaults to 60 seconds. :return: A metric object """ if metric_version is None: metric_version = _get_latest_metric_version() - class_name = f"mlflow.metrics.genai.prompts.{metric_version}.StrictCorrectnessMetric" + class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerCorrectnessMetric" try: - strict_correctness_class_module = _get_class_from_string(class_name) + answer_correctness_class_module = _get_class_from_string(class_name) except ModuleNotFoundError: raise MlflowException( - f"Failed to find strict correctness metric for version {metric_version}." + f"Failed to find answer correctness metric for version {metric_version}." f"Please check the version", error_code=INVALID_PARAMETER_VALUE, ) from None except Exception as e: raise MlflowException( - f"Failed to construct strict correctness metric {metric_version}. Error: {e!r}", + f"Failed to construct answer correctness metric {metric_version}. Error: {e!r}", error_code=INTERNAL_ERROR, ) from None if examples is None: - examples = strict_correctness_class_module.default_examples + examples = answer_correctness_class_module.default_examples if model is None: - model = strict_correctness_class_module.default_model + model = answer_correctness_class_module.default_model return make_genai_metric( - name="strict_correctness", - definition=strict_correctness_class_module.definition, - grading_prompt=strict_correctness_class_module.grading_prompt, + name="answer_correctness", + definition=answer_correctness_class_module.definition, + grading_prompt=answer_correctness_class_module.grading_prompt, examples=examples, version=metric_version, model=model, - grading_context_columns=strict_correctness_class_module.grading_context_columns, - parameters=strict_correctness_class_module.parameters, + grading_context_columns=answer_correctness_class_module.grading_context_columns, + parameters=answer_correctness_class_module.parameters, aggregations=["mean", "variance", "p90"], greater_is_better=True, + judge_request_timeout=judge_request_timeout, ) @experimental -def relevance( +def faithfulness( model: Optional[str] = None, - metric_version: Optional[str] = None, + metric_version: Optional[str] = _get_latest_metric_version(), examples: Optional[List[EvaluationExample]] = None, + judge_request_timeout=60, ) -> EvaluationMetric: """ - This function will create a genai metric used to evaluate the relevance of an LLM using the - model provided. 
Relevance will be assessed by the appropriateness, significance, and - applicability of the output with respect to the ``input`` and ``context``. + This function will create a genai metric used to evaluate the faithfullness of an LLM using the + model provided. Faithfulness will be assessed based on how factually consistent the output + is to the ``context``. - The ``input`` and ``context`` args must be provided as part of the input dataset or output - predictions. This can be mapped to a column of a different name using the a ``col_mapping`` - in the ``evaluator_config``. + The ``context`` eval_arg must be provided as part of the input dataset or output + predictions. This can be mapped to a column of a different name using ``col_mapping`` + in the ``evaluator_config`` parameter. An MlflowException will be raised if the specified version for this metric does not exist. - :param model: (Optional) The model that will be used to evaluate this metric. Defaults to GPT-4. - :param metric_version: (Optional) The version of the relevance metric to use. + :param model: (Optional) The model that will be used to evaluate this metric. Defaults to + gpt-4. Your use of a third party LLM service (e.g., OpenAI) for evaluation may + be subject to and governed by the LLM service's terms of use. + :param metric_version: (Optional) The version of the faithfulness metric to use. Defaults to the latest version. :param examples: (Optional) Provide a list of examples to help the judge model evaluate the - relevance. It is highly recommended to add examples to be used as a reference to evaluate + faithfulness. It is highly recommended to add examples to be used as a reference to evaluate the new results. + :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request. + Defaults to 60 seconds. :return: A metric object """ - if metric_version is None: - metric_version = _get_latest_metric_version() - class_name = f"mlflow.metrics.genai.prompts.{metric_version}.RelevanceMetric" + class_name = f"mlflow.metrics.genai.prompts.{metric_version}.FaithfulnessMetric" + try: + faithfulness_class_module = _get_class_from_string(class_name) + except ModuleNotFoundError: + raise MlflowException( + f"Failed to find faithfulness metric for version {metric_version}." + f" Please check the version", + error_code=INVALID_PARAMETER_VALUE, + ) from None + except Exception as e: + raise MlflowException( + f"Failed to construct faithfulness metric {metric_version}. Error: {e!r}", + error_code=INTERNAL_ERROR, + ) from None + + if examples is None: + examples = faithfulness_class_module.default_examples + if model is None: + model = faithfulness_class_module.default_model + + return make_genai_metric( + name="faithfulness", + definition=faithfulness_class_module.definition, + grading_prompt=faithfulness_class_module.grading_prompt, + examples=examples, + version=metric_version, + model=model, + grading_context_columns=faithfulness_class_module.grading_context_columns, + parameters=faithfulness_class_module.parameters, + aggregations=["mean", "variance", "p90"], + greater_is_better=True, + judge_request_timeout=judge_request_timeout, + ) + + +@experimental +def answer_relevance( + model: Optional[str] = None, + metric_version: Optional[str] = _get_latest_metric_version(), + examples: Optional[List[EvaluationExample]] = None, + judge_request_timeout=60, +) -> EvaluationMetric: + """ + This function will create a genai metric used to evaluate the answer relevance of an LLM + using the model provided. 
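Pulling the renamed judge metrics above together, a hedged usage sketch on a static, RAG-style dataset follows. The column names are placeholders, OPENAI_API_KEY must be set because the default judge is openai:/gpt-4, and the col_mapping pattern follows the grading_context_columns handling described earlier in this diff:

    import mlflow
    import pandas as pd
    from mlflow.metrics import answer_similarity, faithfulness

    # Illustrative static dataset with inputs, model outputs, ground truth, and retrieved context.
    eval_df = pd.DataFrame(
        {
            "inputs": ["What is MLflow?"],
            "outputs": ["MLflow is an open-source platform for the ML lifecycle."],
            "ground_truth": ["MLflow is an open-source platform for managing the end-to-end ML lifecycle."],
            "retrieved_context": ["MLflow is an open-source platform developed by Databricks."],
        }
    )

    mlflow.evaluate(
        data=eval_df,
        predictions="outputs",
        targets="ground_truth",  # mapped to the "targets" grading context of answer_similarity
        extra_metrics=[answer_similarity(), faithfulness()],
        # faithfulness grades against a "context" column; map it to this dataset's column name.
        evaluator_config={"col_mapping": {"context": "retrieved_context"}},
    )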
Answer relevance will be assessed based on the appropriateness and + applicability of the output with respect to the input. + + An MlflowException will be raised if the specified version for this metric does not exist. + + :param model: (Optional) The model that will be used to evaluate this metric. Defaults to + gpt-4. Your use of a third party LLM service (e.g., OpenAI) for evaluation may + be subject to and governed by the LLM service's terms of use. + :param metric_version: (Optional) The version of the answer relevance metric to use. + Defaults to the latest version. + :param examples: (Optional) Provide a list of examples to help the judge model evaluate the + answer relevance. It is highly recommended to add examples to be used as a reference to + evaluate the new results. + :param judge_request_timeout: (Optional) The timeout in seconds for the judge API request. + Defaults to 60 seconds. + :return: A metric object + """ + class_name = f"mlflow.metrics.genai.prompts.{metric_version}.AnswerRelevanceMetric" try: - relevance_class_module = _get_class_from_string(class_name) + answer_relevance_class_module = _get_class_from_string(class_name) except ModuleNotFoundError: raise MlflowException( - f"Failed to find relevance metric for version {metric_version}." + f"Failed to find answer relevance metric for version {metric_version}." f" Please check the version", error_code=INVALID_PARAMETER_VALUE, ) from None except Exception as e: raise MlflowException( - f"Failed to construct relevance metric {metric_version}. Error: {e!r}", + f"Failed to construct answer relevance metric {metric_version}. Error: {e!r}", error_code=INTERNAL_ERROR, ) from None if examples is None: - examples = relevance_class_module.default_examples + examples = answer_relevance_class_module.default_examples if model is None: - model = relevance_class_module.default_model + model = answer_relevance_class_module.default_model return make_genai_metric( - name="relevance", - definition=relevance_class_module.definition, - grading_prompt=relevance_class_module.grading_prompt, + name="answer_relevance", + definition=answer_relevance_class_module.definition, + grading_prompt=answer_relevance_class_module.grading_prompt, examples=examples, version=metric_version, model=model, - grading_context_columns=relevance_class_module.grading_context_columns, - parameters=relevance_class_module.parameters, + grading_context_columns=answer_relevance_class_module.grading_context_columns, + parameters=answer_relevance_class_module.parameters, aggregations=["mean", "variance", "p90"], greater_is_better=True, + judge_request_timeout=judge_request_timeout, ) diff --git a/mlflow/metrics/genai/model_utils.py b/mlflow/metrics/genai/model_utils.py index 102cac0b8fc7b..5643155cd9a03 100644 --- a/mlflow/metrics/genai/model_utils.py +++ b/mlflow/metrics/genai/model_utils.py @@ -5,20 +5,20 @@ import requests from mlflow.exceptions import MlflowException -from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE +from mlflow.protos.databricks_pb2 import BAD_REQUEST, INVALID_PARAMETER_VALUE, UNAUTHENTICATED from mlflow.utils.uri import append_to_uri_path ROUTE_TYPE = "llm/v1/completions" # TODO: improve this name -def score_model_on_payload(model_uri, payload): +def score_model_on_payload(model_uri, payload, timeout): """Call the model identified by the given uri with the given payload.""" prefix, suffix = _parse_model_uri(model_uri) if prefix == "openai": - return _call_openai_api(suffix, payload) + return _call_openai_api(suffix, payload, 
timeout) elif prefix == "gateway": return _call_gateway_api(suffix, payload) elif prefix in ("model", "runs"): @@ -43,7 +43,7 @@ def _parse_model_uri(model_uri): return scheme, path -def _call_openai_api(openai_uri, payload): +def _call_openai_api(openai_uri, payload, timeout): """Wrapper around the OpenAI API to make it compatible with the MLflow Gateway API.""" from mlflow.gateway.config import RouteConfig from mlflow.gateway.providers.openai import OpenAIProvider @@ -54,13 +54,23 @@ def _call_openai_api(openai_uri, payload): error_code=INVALID_PARAMETER_VALUE, ) + config = {"openai_api_key": os.environ["OPENAI_API_KEY"]} + if "OPENAI_API_BASE" in os.environ: + config["openai_api_base"] = os.environ["OPENAI_API_BASE"] + if "OPENAI_API_TYPE" in os.environ: + config["openai_api_type"] = os.environ["OPENAI_API_TYPE"] + if "OPENAI_API_VERSION" in os.environ: + config["openai_api_version"] = os.environ["OPENAI_API_VERSION"] + if "OPENAI_DEPLOYMENT_NAME" in os.environ: + config["openai_deployment_name"] = os.environ["OPENAI_DEPLOYMENT_NAME"] + route_config = RouteConfig( name="openai", route_type=ROUTE_TYPE, model={ "name": openai_uri, "provider": "openai", - "config": {"openai_api_key": os.environ["OPENAI_API_KEY"]}, + "config": config, }, ) openai_provider = OpenAIProvider(route_config) @@ -72,8 +82,23 @@ def _call_openai_api(openai_uri, payload): url=append_to_uri_path(openai_provider._request_base_url, "chat/completions"), headers=openai_provider._request_headers, json=openai_provider._add_model_to_payload_if_necessary(payload), + timeout=timeout, ).json() + if "error" in resp: + error_type = resp["error"]["type"] + if error_type == "invalid_request_error": + raise MlflowException( + f"Invalid Request to OpenAI. Error response:\n {resp}", error_code=BAD_REQUEST + ) + elif error_type == "authentication_error": + raise MlflowException( + f"Authentication Error for OpenAI. Error response:\n {resp}", + error_code=UNAUTHENTICATED, + ) + else: + raise MlflowException(f"Error response from OpenAI:\n {resp}") + return json.loads(openai_provider._prepare_completion_response_payload(resp).json()) diff --git a/mlflow/metrics/genai/prompt_template.py b/mlflow/metrics/genai/prompt_template.py index 90870f9c913ff..64e814ab494da 100644 --- a/mlflow/metrics/genai/prompt_template.py +++ b/mlflow/metrics/genai/prompt_template.py @@ -59,3 +59,6 @@ def partial_fill(self, **kwargs: Any) -> "PromptTemplate": new_template_str = self.template_str.format_map(safe_dict) unfilled_variables = [var for var in self.variables if var not in kwargs.keys()] return PromptTemplate(template_str=new_template_str, variables=unfilled_variables) + + def __str__(self): + return self.template_str diff --git a/mlflow/metrics/genai/prompts/v1.py b/mlflow/metrics/genai/prompts/v1.py index 2fbe6d90ec18f..dd28204ad0899 100644 --- a/mlflow/metrics/genai/prompts/v1.py +++ b/mlflow/metrics/genai/prompts/v1.py @@ -9,7 +9,7 @@ ) # TODO: Update the default_mode and default_parameters to the correct values post experimentation -default_model = "openai:/gpt-3.5-turbo-16k" +default_model = "openai:/gpt-4" default_parameters = { "temperature": 0.0, "max_tokens": 200, @@ -17,17 +17,22 @@ } grading_system_prompt_template = PromptTemplate( """ -Please act as an impartial judge and evaluate the quality of the provided output which -attempts to produce output for the provided input based on a provided information. +Task: +You are an impartial judge. 
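For the _call_openai_api configuration handling above, the judge reads its connection details from environment variables. A sketch of an Azure OpenAI style setup is below; every value is a placeholder, and only the variable names come from the code in this diff:

    import os

    os.environ["OPENAI_API_KEY"] = "<api-key>"
    os.environ["OPENAI_API_TYPE"] = "azure"  # optional; omit to target the public OpenAI API
    os.environ["OPENAI_API_BASE"] = "https://<resource>.openai.azure.com/"
    os.environ["OPENAI_API_VERSION"] = "<api-version>"
    os.environ["OPENAI_DEPLOYMENT_NAME"] = "<deployment-name>"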
You will be given an input that was sent to a machine +learning model, and you will be given an output that the model produced. You +may also be given additional information that was used by the model to generate the output. -You'll be given a grading format below which you'll call for each provided information, -input and provided output to submit your justification and score to compute the {name} of -the output. +Your task is to determine a numerical score called {name} based on the input and output. +A definition of {name} and a grading rubric are provided below. +You must use the grading rubric to determine your score. You must also justify your score. + +Examples could be included below for reference. Make sure to use them as references and to +understand them before completing the task. Input: {input} -Provided output: +Output: {output} {grading_context_columns} @@ -35,15 +40,14 @@ Metric definition: {definition} -Below is your grading criteria: +Grading rubric: {grading_prompt} {examples} -And you'll need to submit your grading for the {name} of the output, -using the following in json format: -Score: [your score number for the {name} of the output] -Justification: [your step by step reasoning about the {name} of the output] +You must return the following fields in your response one below the other: +score: Your numerical score for the model's {name} based on the rubric +justification: Your step-by-step reasoning about the model's {name} score """ ) @@ -67,6 +71,7 @@ def to_dict(self): if self.examples is None or len(self.examples) == 0 else f"Examples:\n{self._format_examples()}" ) + return { "model": self.model, "eval_prompt": grading_system_prompt_template.partial_fill( @@ -83,23 +88,23 @@ def _format_examples(self): @dataclass -class CorrectnessMetric: +class AnswerSimilarityMetric: definition = ( - "Correctness is evaluated on the proximity of the provided output to the ground truth " - "in terms of meaning and description similarity. Scores can be assigned from 1 to 5 based " - "on the gradual similarity in meaning and description to the ground truth." + "Answer similarity is evaluated on the degree of semantic similarity of the provided " + "output to the provided targets, which is the ground truth. Scores can be assigned based " + "on the gradual similarity in meaning and description to the provided targets, where a " + "higher score indicates greater alignment between the provided output and provided targets." ) grading_prompt = ( - "Correctness: Below are the details for different scores:" - "- Score 1: the output is completely incorrect, doesn't mention anything related to the " - "input or is completely contrary to the provided ground truth." - "- Score 2: the output provides some relevance to the input and answers one aspect of the " - "question as in the ground truth." - "- Score 3: the output mostly answers the question but is missing or hallucinating on " - "one critical aspect." - "- Score 5: the output correctly answers the question and is not missing any major aspect " - "provided in the ground truth answer." 
+ "Answer similarity: Below are the details for different scores:\n" + "- Score 1: the output has little to no semantic similarity to the provided targets.\n" + "- Score 2: the output displays partial semantic similarity to the provided targets on " + "some aspects.\n" + "- Score 3: the output has moderate semantic similarity to the provided targets.\n" + "- Score 4: the output aligns with the provided targets in most aspects and has " + "substantial semantic similarity.\n" + "- Score 5: the output closely aligns with the provided targets in all significant aspects." ) grading_context_columns = ["targets"] @@ -110,12 +115,11 @@ class CorrectnessMetric: input="What is MLflow?", output="MLflow is an open-source platform.", score=2, - justification="While the statement correctly identifies MLflow as an open-source platform, " - "it lacks some critical aspects mentioned in the ground truth. Specifically, it doesn't " - "provide information about MLflow's purpose in managing the end-to-end machine learning " - "lifecycle, its development by Databricks, and its focus on addressing challenges faced by " - "data scientists and machine learning engineers. Therefore, it answers one aspect of the " - "question but is missing several critical aspects provided in the ground truth.", + justification="The provided output is partially similar to the target, as it captures the " + "general idea that MLflow is an open-source platform. However, it lacks the comprehensive " + "details and context provided in the target about MLflow's purpose, development, and " + "challenges it addresses. Therefore, it demonstrates partial, but not complete, " + "semantic similarity.", grading_context={ "targets": "MLflow is an open-source platform for managing the end-to-end " "machine learning (ML) lifecycle. It was developed by Databricks, a company " @@ -132,8 +136,10 @@ class CorrectnessMetric: "including experiment tracking, model packaging, versioning, and deployment, simplifying " "the ML lifecycle.", score=4, - justification="The output effectively explains what MLflow is and its purpose. " - "Information about the developer of MLflow could be included for a 5-score.", + justification="The provided output aligns closely with the target. It covers various key " + "aspects mentioned in the target, including managing machine learning workflows, " + "experiment tracking, model packaging, versioning, and deployment. While it may not include" + " every single detail from the target, it demonstrates substantial semantic similarity.", grading_context={ "targets": "MLflow is an open-source platform for managing the end-to-end " "machine learning (ML) lifecycle. It was developed by Databricks, a company " @@ -148,23 +154,28 @@ class CorrectnessMetric: @dataclass -class RelevanceMetric: +class FaithfulnessMetric: definition = ( - "Relevance encompasses the appropriateness, significance, and applicability of the output " - "with respect to the input and context. Scores should range from 1 to 5 and should reflect " - "the extent to which the output directly addresses the question provided in the input, " - "given the provided context." + "Faithfulness is only evaluated with the provided output and provided context, please " + "ignore the provided input entirely when scoring faithfulness. Faithfulness assesses " + "how much of the provided output is factually consistent with the provided context. A " + "higher score indicates that a higher proportion of claims present in the output can be " + "derived from the provided context. 
Faithfulness does not consider how much extra " + "information from the context is not present in the output." ) grading_prompt = ( - "Relevance: Below are the details for different scores:" - "- Score 1: the output doesn't mention anything about the question or is completely " - "irrelevant to the provided context." - "- Score 2: the output provides some relevance to the question and is somehow related to " - "the provided context." - "- Score 3: the output mostly answers the question and is consistent with the provided " - "context." - "- Score 5: the output answers the question comprehensively using the provided context." + "Faithfulness: Below are the details for different scores:\n" + "- Score 1: None of the claims in the output can be inferred from the provided context.\n" + "- Score 2: Some of the claims in the output can be inferred from the provided context, " + "but the majority of the output is missing from, inconsistent with, or contradictory to " + "the provided context.\n" + "- Score 3: Half or more of the claims in the output can be inferred from the provided " + "context.\n" + "- Score 4: Most of the claims in the output can be inferred from the provided context, " + "with very little information that is not directly supported by the provided context.\n" + "- Score 5: All of the claims in the output are directly supported by the provided " + "context, demonstrating high faithfulness to the provided context." ) grading_context_columns = ["context"] @@ -173,15 +184,18 @@ class RelevanceMetric: example_score_2 = EvaluationExample( input="How is MLflow related to Databricks?", - output="Databricks is a data engineering and analytics platform designed to help " - "organizations process and analyze large amounts of data. Databricks is a company " - "specializing in big data and machine learning solutions.", + output="Databricks is a company that specializes in big data and machine learning " + "solutions. MLflow has nothing to do with Databricks. MLflow is an open-source platform " + "for managing the end-to-end machine learning (ML) lifecycle.", score=2, - justification="The output provides relevant information about Databricks, mentioning it as " - "a company specializing in big data and machine learning solutions. However, it doesn't " - "directly address how MLflow is related to Databricks, which is the specific question " - "asked in the input. Therefore, the output is only somewhat related to the provided " - "context.", + justification='The output claims that "MLflow has nothing to do with Databricks" which is ' + 'contradictory to the provided context that states "It was developed by Databricks". This ' + 'is a major inconsistency. However, the output correctly identifies that "MLflow is an ' + 'open-source platform for managing the end-to-end machine learning (ML) lifecycle" and ' + '"Databricks is a company that specializes in big data and machine learning solutions", ' + "which are both supported by the context. Therefore, some of the claims in the output can " + "be inferred from the provided context, but the majority of the output is inconsistent " + "with the provided context, leading to a faithfulness score of 2.", grading_context={ "context": "MLflow is an open-source platform for managing the end-to-end machine " "learning (ML) lifecycle. 
It was developed by Databricks, a company that specializes " @@ -191,16 +205,16 @@ class RelevanceMetric: }, ) - example_score_4 = EvaluationExample( + example_score_5 = EvaluationExample( input="How is MLflow related to Databricks?", - output="MLflow is a product created by Databricks to enhance the efficiency of machine " - "learning processes.", - score=4, - justification="The output provides a relevant and accurate statement about the " - "relationship between MLflow and Databricks. While it doesn't provide extensive detail, " - "it still offers a substantial and meaningful response. To achieve a score of 5, the " - "response could be further improved by providing additional context or details about" - "how MLflow specifically functions within the Databricks ecosystem.", + output="Databricks is a company that specializes in big data and machine learning " + "solutions.", + score=5, + justification='The output states that "Databricks is a company that specializes in big data' + ' and machine learning solutions." This claim is directly supported by the context, which ' + 'states "It was developed by Databricks, a company that specializes in big data and ' + 'machine learning solutions." Therefore, the faithfulness score is 5 as all the claims in ' + 'the output are directly supported by the provided context."', grading_context={ "context": "MLflow is an open-source platform for managing the end-to-end " "machine learning (ML) lifecycle. It was developed by Databricks, a company " @@ -211,36 +225,49 @@ class RelevanceMetric: }, ) - default_examples = [example_score_2, example_score_4] + default_examples = [example_score_2, example_score_5] @dataclass -class StrictCorrectnessMetric: +class AnswerCorrectnessMetric: definition = ( - "When a question demands a specific value, term, or description (e.g., math questions or " - "fact-checking), correctness is binary. Strict correctness of the output is assessed on " - "whether it aligns exactly with the ground truth. Scores are assigned to be 0 or 1." + "Answer correctness is evaluated on the accuracy of the provided output based on the " + "provided targets, which is the ground truth. Scores can be assigned based on the degree " + "of semantic similarity and factual correctness of the provided output to the provided " + "targets, where a higher score indicates higher degree of accuracy." ) grading_prompt = ( - "Strict Correctness: Below are the details for different scores:" - "- Score 0: the output is completely incorrect, doesn't mention anything about the " - "question or is completely contrary to the ground truth." - "- Score 1: the output answers the question correctly as provided in the ground truth." + "Answer Correctness: Below are the details for different scores:\n" + "- Score 1: the output is completely incorrect. It is completely different from or " + "contradicts the provided targets.\n" + "- Score 2: the output demonstrates some degree of semantic similarity and includes " + "partially correct information. However, the output still has significant discrepancies " + "with the provided targets or inaccuracies.\n" + "- Score 3: the output addresses a couple of aspects of the input accurately, aligning " + "with the provided targets. However, there are still omissions or minor inaccuracies.\n" + "- Score 4: the output is mostly correct. It provides mostly accurate information, but " + "there may be one or more minor omissions or inaccuracies.\n" + "- Score 5: the output is correct. 
It demonstrates a high degree of accuracy and " + "semantic similarity to the targets." ) grading_context_columns = ["targets"] parameters = default_parameters default_model = default_model - example_score_0 = EvaluationExample( - input="Is MLflow open-source?", - output="No, MLflow is not open-source.", - score=0, - justification="The output is incorrect. It states that MLflow is not open-source, which " - "contradicts the provided context, where it is explicitly mentioned that MLflow is an " - "open-source platform. This directly opposes the ground truth, resulting in a score of 0 " - "for strict correctness.", + example_score_2 = EvaluationExample( + input="How is MLflow related to Databricks?", + output="Databricks is a data engineering and analytics platform designed to help " + "organizations process and analyze large amounts of data. Databricks is a company " + "specializing in big data and machine learning solutions.", + score=2, + justification="The output provided by the model does demonstrate some degree of semantic " + "similarity to the targets, as it correctly identifies Databricks as a company " + "specializing in big data and machine learning solutions. However, it fails to address " + "the main point of the input question, which is the relationship between MLflow and " + "Databricks. The output does not mention MLflow at all, which is a significant discrepancy " + "with the provided targets. Therefore, the model's answer_correctness score is 2.", grading_context={ "targets": "MLflow is an open-source platform for managing the end-to-end machine " "learning (ML) lifecycle. It was developed by Databricks, a company that specializes " @@ -250,13 +277,17 @@ class StrictCorrectnessMetric: }, ) - example_score_1 = EvaluationExample( - input="Is MLflow open-source?", - output="MLflow is open-source, which means it's freely available for anyone to use.", - score=1, - justification="The output correctly states that MLflow is open-source, aligning perfectly " - "with the provided context. It accurately reflects the ground truth information, earning " - "a score of 1 for strict correctness.", + example_score_4 = EvaluationExample( + input="How is MLflow related to Databricks?", + output="MLflow is a product created by Databricks to enhance the efficiency of machine " + "learning processes.", + score=4, + justification="The output provided by the model is mostly correct. It correctly identifies " + "that MLflow is a product created by Databricks. However, it does not mention that MLflow " + "is an open-source platform for managing the end-to-end machine learning lifecycle, which " + "is a significant part of its function. Therefore, while the output is mostly accurate, " + "it has a minor omission, which is why it gets a score of 4 according to the grading " + "rubric.", grading_context={ "targets": "MLflow is an open-source platform for managing the end-to-end machine " "learning (ML) lifecycle. It was developed by Databricks, a company that specializes " @@ -266,4 +297,67 @@ class StrictCorrectnessMetric: }, ) - default_examples = [example_score_0, example_score_1] + default_examples = [example_score_2, example_score_4] + + +@dataclass +class AnswerRelevanceMetric: + definition = ( + "Answer relevance measures the appropriateness and applicability of the output with " + "respect to the input. Scores should reflect the extent to which the output directly " + "addresses the question provided in the input, and give lower scores for incomplete or " + "redundant output." 
+ ) + + grading_prompt = ( + "Answer relevance: Please give a score from 1-5 based on the degree of relevance to the " + "input, where the lowest and highest scores are defined as follows:" + "- Score 1: the output doesn't mention anything about the question or is completely " + "irrelevant to the input.\n" + "- Score 5: the output addresses all aspects of the question and all parts of the output " + "are meaningful and relevant to the question." + ) + + grading_context_columns = ["context"] + parameters = default_parameters + default_model = default_model + + example_score_2 = EvaluationExample( + input="How is MLflow related to Databricks?", + output="Databricks is a company that specializes in big data and machine learning " + "solutions.", + score=2, + justification="The output provided by the model does give some information about " + "Databricks, which is part of the input question. However, it does not address the main " + "point of the question, which is the relationship between MLflow and Databricks. " + "Therefore, while the output is not completely irrelevant, it does not fully answer the " + "question, leading to a lower score.", + grading_context={ + "context": "MLflow is an open-source platform for managing the end-to-end machine " + "learning (ML) lifecycle. It was developed by Databricks, a company that specializes " + "in big data and machine learning solutions. MLflow is designed to address the " + "challenges that data scientists and machine learning engineers face when developing, " + "training, and deploying machine learning models." + }, + ) + + example_score_5 = EvaluationExample( + input="How is MLflow related to Databricks?", + output="MLflow is a product created by Databricks to enhance the efficiency of machine " + "learning processes.", + score=5, + justification="The output directly addresses the input question by explaining the " + "relationship between MLflow and Databricks. It provides a clear and concise answer that " + "MLflow is a product created by Databricks, and also adds relevant information about the " + "purpose of MLflow, which is to enhance the efficiency of machine learning processes. " + "Therefore, the output is highly relevant to the input and deserves a full score.", + grading_context={ + "context": "MLflow is an open-source platform for managing the end-to-end machine " + "learning (ML) lifecycle. It was developed by Databricks, a company that specializes " + "in big data and machine learning solutions. MLflow is designed to address the " + "challenges that data scientists and machine learning engineers face when developing, " + "training, and deploying machine learning models." 
+ }, + ) + + default_examples = [example_score_2, example_score_5] diff --git a/mlflow/metrics/genai/utils.py b/mlflow/metrics/genai/utils.py index 94966566685c0..3c9f43f1740de 100644 --- a/mlflow/metrics/genai/utils.py +++ b/mlflow/metrics/genai/utils.py @@ -1,2 +1,6 @@ def _get_latest_metric_version(): return "v1" + + +def _get_default_model(): + return "openai:/gpt-4" diff --git a/mlflow/metrics/metric_definitions.py b/mlflow/metrics/metric_definitions.py index 54b4d57c0973b..c547e973e93dd 100644 --- a/mlflow/metrics/metric_definitions.py +++ b/mlflow/metrics/metric_definitions.py @@ -1,3 +1,4 @@ +import functools import logging import os @@ -7,6 +8,11 @@ _logger = logging.getLogger(__name__) +targets_col_specifier = "the column specified by the `targets` parameter" +predictions_col_specifier = ( + "the column specified by the `predictions` parameter or the model output column" +) + def standard_aggregations(scores): return { @@ -16,31 +22,51 @@ def standard_aggregations(scores): } -def _validate_text_data(data, metric_name, column_name): - """Validates that the data is text and is non-empty""" - if len(data) == 0: +def _validate_text_data(data, metric_name, col_specifier): + """Validates that the data is a list of strs and is non-empty""" + if data is None or len(data) == 0: return False for row, line in enumerate(data): if not isinstance(line, str): _logger.warning( f"Cannot calculate {metric_name} for non-string inputs. " - + f"Non-string found for {column_name} on row {row}. skipping metric logging." + f"Non-string found for {col_specifier} on row {row}. Skipping metric logging." ) return False return True -def _token_count_eval_fn(predictions, targets, metrics): +def _validate_and_fix_text_tuple_data(data, metric_name, column_name): + """Validates that the data is a pandas Series of a tuple of strings and is non-empty""" + if data is None or len(data) == 0: + return False + + for index, value in data.items(): + if not isinstance(value, tuple) or not all(isinstance(val, str) for val in value): + # Single entry tuples are automatically unpacked by Pandas. + # So if the entry is a string, put it back into a tuple. + if isinstance(value, str): + data[index] = (value,) + else: + _logger.warning( + f"Cannot calculate metric '{metric_name}' for non-tuple[str] inputs. " + f"Row #{index} of column '{column_name}' has a non-tuple[str] value of:" + f"{value}. Skipping metric logging." 
+ ) + return False + + return True + + +def _token_count_eval_fn(predictions, targets=None, metrics=None): import tiktoken # ref: https://github.com/openai/tiktoken/issues/75 os.environ["TIKTOKEN_CACHE_DIR"] = "" encoding = tiktoken.get_encoding("cl100k_base") - _logger.info("Computing token count metric:") - num_tokens = [] for prediction in predictions: if isinstance(prediction, str): @@ -53,21 +79,24 @@ def _token_count_eval_fn(predictions, targets, metrics): ) -def _toxicity_eval_fn(predictions, targets, metrics): - if not _validate_text_data(predictions, "toxicity", "predictions"): +@functools.lru_cache(maxsize=8) +def _cached_evaluate_load(path, module_type=None): + import evaluate + + return evaluate.load(path, module_type=module_type) + + +def _toxicity_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(predictions, "toxicity", predictions_col_specifier): return try: - _logger.info("Loading toxicity metric:") - import evaluate - - toxicity = evaluate.load("toxicity", module_type="measurement") + toxicity = _cached_evaluate_load("toxicity", module_type="measurement") except Exception as e: _logger.warning( f"Failed to load 'toxicity' metric (error: {e!r}), skipping metric logging." ) return - _logger.info("Computing toxicity metric:") scores = toxicity.compute(predictions=predictions)["toxicity"] toxicity_ratio = toxicity.compute(predictions=predictions, aggregation="ratio")[ "toxicity_ratio" @@ -81,31 +110,8 @@ def _toxicity_eval_fn(predictions, targets, metrics): ) -def _perplexity_eval_fn(predictions, targets, metrics): - if not _validate_text_data(predictions, "perplexity", "predictions"): - return - - try: - _logger.info("Loading perplexity metric:") - import evaluate - - perplexity = evaluate.load("perplexity", module_type="metric") - except Exception as e: - _logger.warning( - f"Failed to load 'perplexity' metric (error: {e!r}), skipping metric logging." 
- ) - return - - _logger.info("Computing perplexity metric:") - scores = perplexity.compute(predictions=predictions, model_id="gpt2")["perplexities"] - return MetricValue( - scores=scores, - aggregate_results=standard_aggregations(scores), - ) - - -def _flesch_kincaid_eval_fn(predictions, targets, metrics): - if not _validate_text_data(predictions, "flesch_kincaid", "predictions"): +def _flesch_kincaid_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(predictions, "flesch_kincaid", predictions_col_specifier): return try: @@ -114,7 +120,6 @@ def _flesch_kincaid_eval_fn(predictions, targets, metrics): _logger.warning("Failed to load flesch kincaid metric, skipping metric logging.") return - _logger.info("Computing flesch kincaid metric:") scores = [textstat.flesch_kincaid_grade(prediction) for prediction in predictions] return MetricValue( scores=scores, @@ -122,8 +127,8 @@ def _flesch_kincaid_eval_fn(predictions, targets, metrics): ) -def _ari_eval_fn(predictions, targets, metrics): - if not _validate_text_data(predictions, "ari", "predictions"): +def _ari_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(predictions, "ari", predictions_col_specifier): return try: @@ -134,7 +139,6 @@ def _ari_eval_fn(predictions, targets, metrics): ) return - _logger.info("Computing automated readability index metric:") scores = [textstat.automated_readability_index(prediction) for prediction in predictions] return MetricValue( scores=scores, @@ -142,7 +146,7 @@ def _ari_eval_fn(predictions, targets, metrics): ) -def _accuracy_eval_fn(predictions, targets, metrics, sample_weight=None): +def _accuracy_eval_fn(predictions, targets=None, metrics=None, sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import accuracy_score @@ -150,123 +154,103 @@ def _accuracy_eval_fn(predictions, targets, metrics, sample_weight=None): return MetricValue(aggregate_results={"exact_match": acc}) -def _rouge1_eval_fn(predictions, targets, metrics): - if targets is not None and len(targets) != 0: - if not _validate_text_data(targets, "rouge1", "targets") or not _validate_text_data( - predictions, "rouge1", "predictions" - ): - return - - try: - import evaluate - - rouge = evaluate.load("rouge") - except Exception as e: - _logger.warning( - f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging." 
- ) - return - - scores = rouge.compute( - predictions=predictions, - references=targets, - rouge_types=["rouge1"], - use_aggregator=False, - )["rouge1"] - return MetricValue( - scores=scores, - aggregate_results=standard_aggregations(scores), - ) +def _rouge1_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(targets, "rouge1", targets_col_specifier) or not _validate_text_data( + predictions, "rouge1", predictions_col_specifier + ): + return + try: + rouge = _cached_evaluate_load("rouge") + except Exception as e: + _logger.warning(f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging.") + return -def _rouge2_eval_fn(predictions, targets, metrics): - if targets is not None and len(targets) != 0: - if not _validate_text_data(targets, "rouge2", "targets") or not _validate_text_data( - predictions, "rouge2", "predictions" - ): - return + scores = rouge.compute( + predictions=predictions, + references=targets, + rouge_types=["rouge1"], + use_aggregator=False, + )["rouge1"] + return MetricValue( + scores=scores, + aggregate_results=standard_aggregations(scores), + ) - try: - import evaluate - rouge = evaluate.load("rouge") - except Exception as e: - _logger.warning( - f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging." - ) - return - - scores = rouge.compute( - predictions=predictions, - references=targets, - rouge_types=["rouge2"], - use_aggregator=False, - )["rouge2"] - return MetricValue( - scores=scores, - aggregate_results=standard_aggregations(scores), - ) +def _rouge2_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(targets, "rouge2", targets_col_specifier) or not _validate_text_data( + predictions, "rouge2", predictions_col_specifier + ): + return + try: + rouge = _cached_evaluate_load("rouge") + except Exception as e: + _logger.warning(f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging.") + return -def _rougeL_eval_fn(predictions, targets, metrics): - if targets is not None and len(targets) != 0: - if not _validate_text_data(targets, "rougeL", "targets") or not _validate_text_data( - predictions, "rougeL", "predictions" - ): - return + scores = rouge.compute( + predictions=predictions, + references=targets, + rouge_types=["rouge2"], + use_aggregator=False, + )["rouge2"] + return MetricValue( + scores=scores, + aggregate_results=standard_aggregations(scores), + ) - try: - import evaluate - rouge = evaluate.load("rouge") - except Exception as e: - _logger.warning( - f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging." 
- ) - return +def _rougeL_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data(targets, "rougeL", targets_col_specifier) or not _validate_text_data( + predictions, "rougeL", predictions_col_specifier + ): + return - scores = rouge.compute( - predictions=predictions, - references=targets, - rouge_types=["rougeL"], - use_aggregator=False, - )["rougeL"] - return MetricValue( - scores=scores, - aggregate_results=standard_aggregations(scores), - ) + try: + rouge = _cached_evaluate_load("rouge") + except Exception as e: + _logger.warning(f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging.") + return + scores = rouge.compute( + predictions=predictions, + references=targets, + rouge_types=["rougeL"], + use_aggregator=False, + )["rougeL"] + return MetricValue( + scores=scores, + aggregate_results=standard_aggregations(scores), + ) -def _rougeLsum_eval_fn(predictions, targets, metrics): - if targets is not None and len(targets) != 0: - if not _validate_text_data(targets, "rougeLsum", "targets") or not _validate_text_data( - predictions, "rougeLsum", "predictions" - ): - return - try: - import evaluate +def _rougeLsum_eval_fn(predictions, targets=None, metrics=None): + if not _validate_text_data( + targets, "rougeLsum", targets_col_specifier + ) or not _validate_text_data(predictions, "rougeLsum", predictions_col_specifier): + return - rouge = evaluate.load("rouge") - except Exception as e: - _logger.warning( - f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging." - ) - return + try: + rouge = _cached_evaluate_load("rouge") + except Exception as e: + _logger.warning(f"Failed to load 'rouge' metric (error: {e!r}), skipping metric logging.") + return - scores = rouge.compute( - predictions=predictions, - references=targets, - rouge_types=["rougeLsum"], - use_aggregator=False, - )["rougeLsum"] - return MetricValue( - scores=scores, - aggregate_results=standard_aggregations(scores), - ) + scores = rouge.compute( + predictions=predictions, + references=targets, + rouge_types=["rougeLsum"], + use_aggregator=False, + )["rougeLsum"] + return MetricValue( + scores=scores, + aggregate_results=standard_aggregations(scores), + ) -def _mae_eval_fn(predictions, targets, metrics, sample_weight=None): +def _mae_eval_fn(predictions, targets=None, metrics=None, sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import mean_absolute_error @@ -274,7 +258,7 @@ def _mae_eval_fn(predictions, targets, metrics, sample_weight=None): return MetricValue(aggregate_results={"mean_absolute_error": mae}) -def _mse_eval_fn(predictions, targets, metrics, sample_weight=None): +def _mse_eval_fn(predictions, targets=None, metrics=None, sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import mean_squared_error @@ -282,7 +266,7 @@ def _mse_eval_fn(predictions, targets, metrics, sample_weight=None): return MetricValue(aggregate_results={"mean_squared_error": mse}) -def _rmse_eval_fn(predictions, targets, metrics, sample_weight=None): +def _rmse_eval_fn(predictions, targets=None, metrics=None, sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import mean_squared_error @@ -290,7 +274,7 @@ def _rmse_eval_fn(predictions, targets, metrics, sample_weight=None): return MetricValue(aggregate_results={"root_mean_squared_error": rmse}) -def _r2_score_eval_fn(predictions, targets, metrics, sample_weight=None): +def _r2_score_eval_fn(predictions, targets=None, metrics=None, 
sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import r2_score @@ -298,7 +282,7 @@ def _r2_score_eval_fn(predictions, targets, metrics, sample_weight=None): return MetricValue(aggregate_results={"r2_score": r2}) -def _max_error_eval_fn(predictions, targets, metrics): +def _max_error_eval_fn(predictions, targets=None, metrics=None): if targets is not None and len(targets) != 0: from sklearn.metrics import max_error @@ -306,7 +290,7 @@ def _max_error_eval_fn(predictions, targets, metrics): return MetricValue(aggregate_results={"max_error": error}) -def _mape_eval_fn(predictions, targets, metrics, sample_weight=None): +def _mape_eval_fn(predictions, targets=None, metrics=None, sample_weight=None): if targets is not None and len(targets) != 0: from sklearn.metrics import mean_absolute_percentage_error @@ -315,7 +299,7 @@ def _mape_eval_fn(predictions, targets, metrics, sample_weight=None): def _recall_eval_fn( - predictions, targets, metrics, pos_label=1, average="binary", sample_weight=None + predictions, targets=None, metrics=None, pos_label=1, average="binary", sample_weight=None ): if targets is not None and len(targets) != 0: from sklearn.metrics import recall_score @@ -327,7 +311,7 @@ def _recall_eval_fn( def _precision_eval_fn( - predictions, targets, metrics, pos_label=1, average="binary", sample_weight=None + predictions, targets=None, metrics=None, pos_label=1, average="binary", sample_weight=None ): if targets is not None and len(targets) != 0: from sklearn.metrics import precision_score @@ -343,7 +327,7 @@ def _precision_eval_fn( def _f1_score_eval_fn( - predictions, targets, metrics, pos_label=1, average="binary", sample_weight=None + predictions, targets=None, metrics=None, pos_label=1, average="binary", sample_weight=None ): if targets is not None and len(targets) != 0: from sklearn.metrics import f1_score @@ -356,3 +340,25 @@ def _f1_score_eval_fn( sample_weight=sample_weight, ) return MetricValue(aggregate_results={"f1_score": f1}) + + +def _precision_at_k_eval_fn(k): + def _fn(predictions, targets): + if not _validate_and_fix_text_tuple_data( + predictions, "precision_at_k", "predictions" + ) or not _validate_and_fix_text_tuple_data(targets, "precision_at_k", "targets"): + return + + scores = [] + for i in range(len(predictions)): + # only include the top k retrieved chunks + ground_truth, retrieved = set(targets[i]), predictions[i][:k] + relevant_doc_count = sum(1 for doc in retrieved if doc in ground_truth) + if len(retrieved) > 0: + scores.append(relevant_doc_count / len(retrieved)) + else: + scores.append(1) + + return MetricValue(scores=scores, aggregate_results=standard_aggregations(scores)) + + return _fn diff --git a/mlflow/ml-package-versions.yml b/mlflow/ml-package-versions.yml index 7aed33623a80e..4fdc2635d3fe1 100644 --- a/mlflow/ml-package-versions.yml +++ b/mlflow/ml-package-versions.yml @@ -126,7 +126,7 @@ xgboost: ">= 0.0.0": ["scikit-learn"] "< 1.6": ["pandas<2"] run: | - pytest tests/xgboost/test_xgboost_model_export.py + dev/pytest.sh tests/xgboost/test_xgboost_model_export.py autologging: minimum: "1.4.2" @@ -135,7 +135,7 @@ xgboost: ">= 0.0.0": ["scikit-learn", "matplotlib"] "< 1.6": ["pandas<2"] run: | - pytest tests/xgboost/test_xgboost_autolog.py + dev/pytest.sh tests/xgboost/test_xgboost_autolog.py lightgbm: package_info: diff --git a/mlflow/models/docker_utils.py b/mlflow/models/docker_utils.py index 30931fde329a1..734f0eca0ec44 100644 --- a/mlflow/models/docker_utils.py +++ b/mlflow/models/docker_utils.py @@ 
-105,8 +105,8 @@ def _get_maven_proxy(): {custom_setup_steps} -# granting read/write access and conditional execution authority to all child directories -# and files to allow for deployment to AWS Sagemaker Serverless Endpoints +# granting read/write access and conditional execution authority to all child directories +# and files to allow for deployment to AWS Sagemaker Serverless Endpoints # (see https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints.html) RUN chmod o+rwX /opt/mlflow/ diff --git a/mlflow/models/evaluation/base.py b/mlflow/models/evaluation/base.py index 47c81b0e17940..9b49fd5183ca3 100644 --- a/mlflow/models/evaluation/base.py +++ b/mlflow/models/evaluation/base.py @@ -1,4 +1,3 @@ -import hashlib import json import logging import math @@ -15,7 +14,7 @@ from contextlib import contextmanager from decimal import Decimal from types import FunctionType -from typing import Any, Dict, Optional +from typing import Any, Dict, List, Optional import mlflow from mlflow.data.dataset import Dataset @@ -31,7 +30,7 @@ from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE from mlflow.tracking.artifact_utils import _download_artifact_from_uri from mlflow.tracking.client import MlflowClient -from mlflow.utils import _get_fully_qualified_class_name +from mlflow.utils import _get_fully_qualified_class_name, insecure_hash from mlflow.utils.annotations import developer_stable, experimental from mlflow.utils.class_utils import _get_class_from_string from mlflow.utils.file_utils import TempDir @@ -55,7 +54,7 @@ class _ModelType: QUESTION_ANSWERING = "question-answering" TEXT_SUMMARIZATION = "text-summarization" TEXT = "text" - # TODO: Add 'retrieval' model type + RETRIEVER = "retriever" def __init__(self): raise NotImplementedError("This class is not meant to be instantiated.") @@ -68,6 +67,7 @@ def values(cls): cls.QUESTION_ANSWERING, cls.TEXT_SUMMARIZATION, cls.TEXT, + cls.RETRIEVER, ) @@ -110,23 +110,30 @@ def eval_fn( :param long_name: (Optional) The long name of the metric. For example, ``"root_mean_squared_error"`` for ``"mse"``. :param version: (Optional) The metric version. For example ``v1``. + :param metric_details: (Optional) A description of the metric and how it is calculated. ''' - def __init__(self, eval_fn, name, greater_is_better, long_name=None, version=None): + def __init__( + self, eval_fn, name, greater_is_better, long_name=None, version=None, metric_details=None + ): self.eval_fn = eval_fn self.name = name self.greater_is_better = greater_is_better self.long_name = long_name or name self.version = version + self.metric_details = metric_details def __str__(self): + parts = [f"name={self.name}, greater_is_better={self.greater_is_better}"] + if self.long_name: - return ( - f"EvaluationMetric(name={self.name}, long_name={self.long_name}, " - f"greater_is_better={self.greater_is_better})" - ) - else: - return f"EvaluationMetric(name={self.name}, greater_is_better={self.greater_is_better})" + parts.append(f"long_name={self.long_name}") + if self.version: + parts.append(f"version={self.version}") + if self.metric_details: + parts.append(f"metric_details={self.metric_details}") + + return "EvaluationMetric(" + ", ".join(parts) + ")" def make_metric( @@ -136,6 +143,7 @@ def make_metric( name=None, long_name=None, version=None, + metric_details=None, ): ''' A factory function to create an :py:class:`EvaluationMetric` object. @@ -176,6 +184,7 @@ def eval_fn( :param long_name: (Optional) The long name of the metric. 
For example, ``"mean_squared_error"`` for ``"mse"``. :param version: (Optional) The metric version. For example ``v1``. + :param metric_details: (Optional) A description of the metric and how it is calculated. .. seealso:: @@ -195,7 +204,7 @@ def eval_fn( ) name = eval_fn.__name__ - return EvaluationMetric(eval_fn, name, greater_is_better, long_name, version) + return EvaluationMetric(eval_fn, name, greater_is_better, long_name, version, metric_details) @developer_stable @@ -597,7 +606,7 @@ def __init__( ) # generate dataset hash - md5_gen = hashlib.md5() + md5_gen = insecure_hash.md5() _gen_md5_for_arraylike_obj(md5_gen, self._features_data) if self._labels_data is not None: _gen_md5_for_arraylike_obj(md5_gen, self._labels_data) @@ -656,7 +665,7 @@ def has_predictions(self): @property def predictions_name(self): """ - return targets name + return predictions name """ return self._predictions_name @@ -770,6 +779,7 @@ def evaluate( extra_metrics=None, custom_artifacts=None, baseline_model=None, + predictions=None, **kwargs, ): """ @@ -794,6 +804,9 @@ def evaluate( flavor as a baseline model to be compared with the candidate model (specified by the `model` param) for model validation. (pyfunc model instance is not allowed) + :param predictions: The column name of the model output column that is used for evaluation. + This is only used when a model returns a pandas dataframe that contains + multiple columns. :return: A :py:class:`mlflow.models.EvaluationResult` instance containing evaluation metrics for candidate model and baseline model and artifacts for candidate model. @@ -1067,6 +1080,7 @@ def _evaluate( extra_metrics, custom_artifacts, baseline_model, + predictions, ): """ The public API "evaluate" will verify argument first, and then pass normalized arguments @@ -1108,6 +1122,7 @@ def _evaluate( extra_metrics=extra_metrics, custom_artifacts=custom_artifacts, baseline_model=baseline_model, + predictions=predictions, ) eval_results.append(eval_result) @@ -1145,7 +1160,7 @@ def predict(self, context, model_input: pd.DataFrame): def evaluate( - model: Optional[str] = None, + model=None, data=None, *, model_type: Optional[str] = None, @@ -1156,7 +1171,7 @@ def evaluate( evaluators=None, evaluator_config=None, custom_metrics=None, - extra_metrics=None, + extra_metrics: Optional[List[EvaluationMetric]] = None, custom_artifacts=None, validation_thresholds=None, baseline_model=None, @@ -1198,16 +1213,11 @@ def evaluate( precision_recall_auc), precision-recall merged curves plot, ROC merged curves plot. - For question-answering models, the default evaluator logs: - - **metrics**: ``exact_match``, `mean_perplexity`_ (requires `evaluate`_, `pytorch`_, - `transformers`_), `toxicity_ratio`_ (requires `evaluate`_, `pytorch`_, `transformers`_), - `mean_ari_grade_level`_ (requires `textstat`_), `mean_flesch_kincaid_grade_level`_ - (requires `textstat`_). + - **metrics**: ``exact_match``, ``token_count``, `toxicity_ratio`_ (requires `evaluate`_, + `pytorch`_, `mean_flesch_kincaid_grade_level`_ (requires `textstat`_). - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` argument is supplied), and per-row metrics of the model in tabular format. - .. _mean_perplexity: - https://huggingface.co/spaces/evaluate-metric/perplexity - .. 
_toxicity_ratio: https://huggingface.co/spaces/evaluate-measurement/toxicity @@ -1230,20 +1240,16 @@ def evaluate( https://pypi.org/project/textstat - For text-summarization models, the default evaluator logs: - - **metrics**: `ROUGE`_ (requires `evaluate`_, `nltk`_, and `rouge_score`_ to be installed), - `mean_perplexity`_ (requires `evaluate`_, `pytorch`_, - `transformers`_), `toxicity_ratio`_ (requires `evaluate`_, `pytorch`_, `transformers`_), - `mean_ari_grade_level`_ (requires `textstat`_), `mean_flesch_kincaid_grade_level`_ - (requires `textstat`_). + - **metrics**: ``token_count``, `ROUGE`_ (requires `evaluate`_, `nltk`_, and + `rouge_score`_ to be installed), `toxicity_ratio`_ (requires `evaluate`_, `pytorch`_, + `transformers`_), `mean_ari_grade_level`_ (requires `textstat`_), + `mean_flesch_kincaid_grade_level`_ (requires `textstat`_). - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` argument is supplied), and per-row metrics of the model in the tabular format. .. _ROUGE: https://huggingface.co/spaces/evaluate-metric/rouge - .. _mean_perplexity: - https://huggingface.co/spaces/evaluate-metric/perplexity - .. _toxicity_ratio: https://huggingface.co/spaces/evaluate-measurement/toxicity @@ -1272,19 +1278,15 @@ def evaluate( https://pypi.org/project/textstat - For text models, the default evaluator logs: - - **metrics**: `mean_perplexity`_ (requires `evaluate`_, `pytorch`_, - `transformers`_), `toxicity_ratio`_ (requires `evaluate`_, `pytorch`_, `transformers`_), - `mean_ari_grade_level`_ (requires `textstat`_), `mean_flesch_kincaid_grade_level`_ - (requires `textstat`_). + - **metrics**: ``token_count``, `toxicity_ratio`_ (requires `evaluate`_, `pytorch`_, + `transformers`_), `mean_ari_grade_level`_ (requires `textstat`_), + `mean_flesch_kincaid_grade_level`_ (requires `textstat`_). - **artifacts**: A JSON file containing the inputs, outputs, targets (if the ``targets`` argument is supplied), and per-row metrics of the model in tabular format. .. _evaluate: https://pypi.org/project/evaluate - .. _mean_perplexity: - https://huggingface.co/spaces/evaluate-metric/perplexity - .. _toxicity_ratio: https://huggingface.co/spaces/evaluate-measurement/toxicity @@ -1303,6 +1305,13 @@ def evaluate( .. _textstat: https://pypi.org/project/textstat + - For retriever models, the default evaluator logs: + - **metrics**: ``precision_at_k``: has a default value of k = 3. To use a different + value for k, include ``"k"`` in the ``evaluator_config`` parameter: + ``evaluator_config={"k":5}``. + - **artifacts**: A JSON file containing the inputs, outputs, targets, and per-row metrics + of the model in tabular format. + - For sklearn models, the default evaluator additionally logs the model's evaluation criterion (e.g. mean accuracy for a classifier) computed by `model.score` method. @@ -1346,6 +1355,9 @@ def evaluate( metrics. - **col_mapping**: A dictionary mapping column names in the input dataset or output predictions to column names used when invoking the evaluation functions. + - **k**: The number of top-ranked retrieved documents to use when computing the built-in + metric ``precision_at_k`` for model_type="retriever". Default value is 3. For all other + model types, this parameter will be ignored. - Limitations of evaluation dataset: - For classification tasks, dataset labels are used to infer the total number of classes. 
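A minimal usage sketch of the retriever evaluation described above (illustrative only, not taken from this diff): it assumes a static pandas DataFrame whose hypothetical ``retrieved`` and ``ground_truth`` columns hold tuples of document strings, and overrides the default k=3 through ``evaluator_config``:

import mlflow
import pandas as pd

# Hypothetical static dataset: each cell is a tuple of document identifiers.
data = pd.DataFrame(
    {
        "question": ["What is MLflow?"],
        "retrieved": [("doc_1", "doc_3", "doc_5")],  # model output to evaluate
        "ground_truth": [("doc_1", "doc_2")],  # relevant documents
    }
)

# With model=None, `predictions` names the column in `data` that holds the model
# output; evaluator_config={"k": 5} raises the top-k cutoff used by precision_at_k
# from its default of 3.
results = mlflow.evaluate(
    data=data,
    targets="ground_truth",
    predictions="retrieved",
    model_type="retriever",
    evaluator_config={"k": 5},
)
print(results.metrics)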
@@ -1430,8 +1442,32 @@ def fn(model_input): ``data`` is a :py:class:`mlflow.data.dataset.Dataset` that defines targets, then ``targets`` is optional. - :param predictions: Optional. Only used when ``model`` is not specified and ``data`` is a pandas - dataframe. The name of the column in ``data`` that contains model outputs. + :param predictions: Optional. The name of the column that contains model outputs. There are two + cases where this argument is required: + + - When ``model`` is specified and outputs multiple columns. The + ``predictions`` should be the name of the column that is used for + evaluation. + - When ``model`` is not specified and ``data`` is a pandas dataframe. The + ``predictions`` should be the name of the column in ``data`` that + contains model outputs. + + .. code-block:: python + :caption: Example usage of predictions + + # Evaluate a model that outputs multiple columns + data = pd.DataFrame({"question": ["foo"]}) + + + def model(inputs): + return pd.DataFrame({"answer": ["bar"], "source": ["baz"]}) + + + results = evaluate(model=model, data=data, predictions="answer", ...) + + # Evaluate a static dataset + data = pd.DataFrame({"question": ["foo"], "answer": ["bar"], "source": ["baz"]}) + results = evaluate(data=data, predictions="answer", ...) :param model_type: (Optional) A string describing the model type. The default evaluator supports the following model types: @@ -1441,13 +1477,15 @@ def fn(model_input): - ``'question-answering'`` - ``'text-summarization'`` - ``'text'`` + - ``'retriever'`` If no ``model_type`` is specified, then you must provide a a list of - metrics to compute via the``extra_metrics`` param. + metrics to compute via the ``extra_metrics`` param. .. note:: - ``'question-answering'``, ``'text-summarization'``, and ``'text'`` - are experimental and may be changed or removed in a future release. + ``'question-answering'``, ``'text-summarization'``, ``'text'``, and + ``'retriever'`` are experimental and may be changed or removed in a + future release. :param dataset_path: (Optional) The path where the data is stored. Must not contain double quotes (``“``). If specified, the path is logged to the ``mlflow.datasets`` @@ -1475,10 +1513,10 @@ def fn(model_input): :param extra_metrics: (Optional) A list of :py:class:`EvaluationMetric ` objects. See the `mlflow.metrics` module for more information about the - builtin metrics and how to define custom metrics + builtin metrics and how to define extra metrics .. code-block:: python - :caption: Example usage of custom metrics + :caption: Example usage of extra metrics import mlflow import numpy as np @@ -1630,18 +1668,53 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): from mlflow.pyfunc import PyFuncModel, _load_model_or_server, _ServedPyFuncModel from mlflow.utils import env_manager as _EnvManager + if evaluator_config is not None: + col_mapping = evaluator_config.get("col_mapping", {}) + + if isinstance(targets, str): + targets = col_mapping.get(targets, targets) + + if isinstance(predictions, str): + predictions = col_mapping.get(predictions, predictions) + if data is None: raise MlflowException( message="The data argument cannot be None.", error_code=INVALID_PARAMETER_VALUE ) - if predictions is not None and model is not None: + _EnvManager.validate(env_manager) + + # If Dataset is provided, the targets can only be specified by the Dataset, + # not the targets parameter of the mlflow.evaluate() API.
+ if isinstance(data, Dataset) and targets is not None: raise MlflowException( - message="The predictions argument cannot be specified when model is specified.", + message="The top-level targets parameter should not be specified since a Dataset " + "is used. Please only specify the targets column name in the Dataset. For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. " + "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`.", + error_code=INVALID_PARAMETER_VALUE, + ) + # If Dataset is provided and model is None, then the predictions can only be specified by the + # Dataset, not the predictions parameters of the mlflow.evaluate() API. + if isinstance(data, Dataset) and model is None and predictions is not None: + raise MlflowException( + message="The top-level predictions parameter should not be specified since a Dataset " + "is used. Please only specify the predictions column name in the Dataset. For example:" + " `data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`" + "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`.", + error_code=INVALID_PARAMETER_VALUE, + ) + # If Dataset is provided and model is specified, then the data.predictions cannot be specified. + if ( + isinstance(data, Dataset) + and model is not None + and getattr(data, "predictions", None) is not None + ): + raise MlflowException( + message="The predictions parameter should not be specified in the Dataset since a " + "model is specified. Please remove the predictions column from the Dataset.", error_code=INVALID_PARAMETER_VALUE, ) - - _EnvManager.validate(env_manager) if model_type in [_ModelType.REGRESSOR, _ModelType.CLASSIFIER]: if isinstance(data, Dataset): @@ -1649,8 +1722,9 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): targets = data.targets else: raise MlflowException( - message="The targets argument is required when data is a Dataset and does not " - "define targets.", + message="The targets column name must be specified in the provided Dataset " + f"for {model_type} models. For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`", error_code=INVALID_PARAMETER_VALUE, ) else: @@ -1695,29 +1769,13 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): "parameter when model=None.", error_code=INVALID_PARAMETER_VALUE, ) - if predictions not in data.columns: - raise MlflowException( - message=f"The specified predictions column '{predictions}' is not " - "found in the specified data.", - error_code=INVALID_PARAMETER_VALUE, - ) elif isinstance(data, mlflow.data.pandas_dataset.PandasDataset): - # If data is a mlflow PandasDataset with predictions specified - # check that exact one predictions column is specified - if data.predictions is not None: - if predictions is not None and predictions != data.predictions: - raise MlflowException( - message="The predictions parameter must be None or the same as " - "data.predictions when data.predictions is specified. Found " - f"predictions='{predictions}', data.predictions='{data.predictions}'.", - error_code=INVALID_PARAMETER_VALUE, - ) - else: # predictions is None or predictions == data.predictions - pass # OK: exact one predictions column is specified - else: + # If data is a mlflow PandasDataset, data.predictions must be specified + if data.predictions is None: raise MlflowException( message="The predictions parameter must be specified with the provided " - "PandasDataset when model=None.", + "PandasDataset when model=None. 
For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`", error_code=INVALID_PARAMETER_VALUE, ) else: @@ -1777,9 +1835,12 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): with _start_run_or_reuse_active_run() as run_id: if not isinstance(data, Dataset): # Convert data to `mlflow.data.dataset.Dataset`. - data = _convert_data_to_mlflow_dataset( - data=data, targets=targets, predictions=predictions - ) + if model is None: + data = _convert_data_to_mlflow_dataset( + data=data, targets=targets, predictions=predictions + ) + else: + data = _convert_data_to_mlflow_dataset(data=data, targets=targets) from mlflow.data.pyfunc_dataset_mixin import PyFuncConvertibleDatasetMixin @@ -1800,6 +1861,7 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): path=dataset_path, feature_names=feature_names, ) + predictions_expected_in_model_output = predictions if model is not None else None try: evaluate_result = _evaluate( @@ -1813,6 +1875,7 @@ def pred_sample(eval_df, _builtin_metrics, _artifacts_dir): extra_metrics=extra_metrics, custom_artifacts=custom_artifacts, baseline_model=baseline_model, + predictions=predictions_expected_in_model_output, ) finally: if isinstance(model, _ServedPyFuncModel): diff --git a/mlflow/models/evaluation/default_evaluator.py b/mlflow/models/evaluation/default_evaluator.py index 95658ec51f75b..a972dff8fb1b5 100644 --- a/mlflow/models/evaluation/default_evaluator.py +++ b/mlflow/models/evaluation/default_evaluator.py @@ -9,6 +9,7 @@ import shutil import tempfile import time +import traceback import warnings from collections import namedtuple from functools import partial @@ -25,11 +26,12 @@ from mlflow.entities.metric import Metric from mlflow.exceptions import MlflowException from mlflow.metrics import ( + EvaluationMetric, MetricValue, ari_grade_level, exact_match, flesch_kincaid_grade_level, - perplexity, + precision_at_k, rouge1, rouge2, rougeL, @@ -61,7 +63,6 @@ _DEFAULT_SAMPLE_ROWS_FOR_SHAP = 2000 _EVAL_TABLE_FILE_NAME = "eval_results_table.json" -_Y_PREDICTED_OUTPUT_COLUMN_NAME = "predicted_column" _TOKEN_COUNT_METRIC_NAME = "token_count" _LATENCY_METRIC_NAME = "latency" @@ -444,6 +445,10 @@ def _get_aggregate_metrics_values(metrics): def _extract_output_and_other_columns(model_predictions, output_column_name): y_pred = None other_output_columns = None + ERROR_MISSING_OUTPUT_COLUMN_NAME = ( + "Output column name is not specified for the multi-output model. " + "Please set the correct output column name using the `predictions` parameter." + ) if isinstance(model_predictions, list) and all(isinstance(p, dict) for p in model_predictions): # Extract 'y_pred' and 'other_output_columns' from list of dictionaries @@ -455,20 +460,32 @@ def _extract_output_and_other_columns(model_predictions, output_column_name): [{k: v for k, v in p.items() if k != output_column_name} for p in model_predictions] ) elif len(model_predictions) > 1: + if output_column_name is None: + raise MlflowException( + ERROR_MISSING_OUTPUT_COLUMN_NAME, + error_code=INVALID_PARAMETER_VALUE, + ) raise MlflowException( f"Output column name '{output_column_name}' is not found in the model " f"predictions list: {model_predictions}. Please set the correct output column " - "name using the `predicted_column` parameter in evaluator config." 
+ "name using the `predictions` parameter.", + error_code=INVALID_PARAMETER_VALUE, ) elif isinstance(model_predictions, pd.DataFrame): if output_column_name in model_predictions.columns: y_pred = model_predictions[output_column_name] other_output_columns = model_predictions.drop(columns=output_column_name) elif model_predictions.shape[1] > 1: + if output_column_name is None: + raise MlflowException( + ERROR_MISSING_OUTPUT_COLUMN_NAME, + error_code=INVALID_PARAMETER_VALUE, + ) raise MlflowException( f"Output column name '{output_column_name}' is not found in the model " f"predictions dataframe {model_predictions.columns}. Please set the correct " - "output column name using the `predicted_column` parameter in evaluator config." + "output column name using the `predictions` parameter.", + error_code=INVALID_PARAMETER_VALUE, ) elif isinstance(model_predictions, dict): if output_column_name in model_predictions: @@ -477,10 +494,16 @@ def _extract_output_and_other_columns(model_predictions, output_column_name): {k: v for k, v in model_predictions.items() if k != output_column_name} ) elif len(model_predictions) > 1: + if output_column_name is None: + raise MlflowException( + ERROR_MISSING_OUTPUT_COLUMN_NAME, + error_code=INVALID_PARAMETER_VALUE, + ) raise MlflowException( f"Output column name '{output_column_name}' is not found in the " f"model predictions dict {model_predictions}. Please set the correct " - "output column name using the `predicted_column` parameter in evaluator config." + "output column name using the `predictions` parameter.", + error_code=INVALID_PARAMETER_VALUE, ) return y_pred if y_pred is not None else model_predictions, other_output_columns @@ -1130,60 +1153,61 @@ def _get_args_for_metrics(self, extra_metric, eval_df): input_df = self.X.copy_to_avoid_mutation() parameters = inspect.signature(extra_metric.eval_fn).parameters eval_fn_args = [] + params_not_found = [] + # eval_fn has parameters (eval_df, builtin_metrics) for backwards compatibility if len(parameters) == 2: + param_0_name, param_1_name = parameters.keys() + if len(parameters) == 2 and param_0_name != "predictions" and param_1_name != "targets": eval_fn_args.append(eval_df_copy) - if "metrics" in parameters.keys(): - eval_fn_args.append(copy.deepcopy(self.metrics_values)) - else: - eval_fn_args.append(copy.deepcopy(self.metrics)) + eval_fn_args.append(copy.deepcopy(self.metrics)) + # eval_fn can have parameters like (predictions, targets, metrics, random_col) else: for param_name, param in parameters.items(): - if param_name == "predictions": + column = self.col_mapping.get(param_name, param_name) + + if column == "predictions" or column == self.dataset.predictions_name: eval_fn_args.append(eval_df_copy["prediction"]) - elif param_name == "targets": + elif column == "targets" or column == self.dataset.targets_name: if "target" in eval_df_copy: eval_fn_args.append(eval_df_copy["target"]) else: - eval_fn_args.append(None) - elif param_name == "metrics": + if param.default == inspect.Parameter.empty: + params_not_found.append(param_name) + else: + eval_fn_args.append(param.default) + elif column == "metrics": eval_fn_args.append(copy.deepcopy(self.metrics_values)) else: - column = self.col_mapping.get(param_name, param_name) + # case when column passed in col_mapping contains the entire column if not isinstance(column, str): eval_fn_args.append(column) + + # case column in col_mapping is string and the column value + # is part of the input_df elif column in input_df.columns: eval_fn_args.append(input_df[column]) 
+ + # case column in col_mapping is string and the column value + # is part of the output_df(other than predictions) elif ( self.other_output_columns is not None and column in self.other_output_columns.columns ): eval_fn_args.append(self.other_output_columns[column]) + + # case where the param is defined as part of the evaluator_config + elif column in self.evaluator_config: + eval_fn_args.append(self.evaluator_config.get(column)) elif param.default == inspect.Parameter.empty: - output_column_name = self.evaluator_config.get( - _Y_PREDICTED_OUTPUT_COLUMN_NAME, "output" - ) - output_columns = list(self.other_output_columns.columns) - input_columns = list(input_df.columns) - raise MlflowException( - "Error: Metric Calculation Failed\n" - f"Metric '{extra_metric.name}' requires the column '{param_name}' to " - "be defined in either the input data or resulting output data.\n\n" - "Below are the existing column names for the input/output data:\n" - f"Input Columns: {input_columns}\n" - f"Output Columns: {output_columns}\n" - "Note that this does not include the output column: " - f"'{output_column_name}'\n\n" - f"To resolve this issue, you may want to map {param_name} to an " - "existing column using the following configuration:\n" - f"evaluator_config={{'col_mapping': {{'{param_name}': " - "''}}\n" - ) + params_not_found.append(param_name) + else: + eval_fn_args.append(param.default) + if len(params_not_found) > 0: + return extra_metric.name, params_not_found return eval_fn_args def _evaluate_extra_metrics(self, eval_df): - if not self.extra_metrics: - return for index, extra_metric in enumerate(self.extra_metrics): eval_fn_args = self._get_args_for_metrics(extra_metric, eval_df) _logger.info(f"Evaluating metrics: {extra_metric.name}") @@ -1307,20 +1331,30 @@ def predict_with_latency(X_copy): ) X_copy = self.X.copy_to_avoid_mutation() - if compute_latency: - model_predictions = predict_with_latency(X_copy) - else: - if self.model is not None: - model_predictions = self.model.predict(X_copy) + if self.model is not None: + _logger.info("Computing model predictions.") + + if compute_latency: + model_predictions = predict_with_latency(X_copy) else: - if self.dataset.predictions_data is None: - raise MlflowException( - message="Predictions data is missing when model is not provided. " - "Please provide predictions data in the pandas dataset or provide " - "a model.", - error_code=INVALID_PARAMETER_VALUE, - ) - model_predictions = self.dataset.predictions_data + model_predictions = self.model.predict(X_copy) + else: + if self.dataset.predictions_data is None: + raise MlflowException( + message="Predictions data is missing when model is not provided. " + "Please provide predictions data in a dataset or provide a model. " + "See the documentation for mlflow.evaluate() for how to specify " + "the predictions data in a dataset.", + error_code=INVALID_PARAMETER_VALUE, + ) + if compute_latency: + _logger.warning( + "Setting the latency to 0 for all entries because the model " "is not provided." 
+ ) + self.metrics_values.update( + {_LATENCY_METRIC_NAME: MetricValue(scores=[0.0] * len(X_copy))} + ) + model_predictions = self.dataset.predictions_data if self.model_type == _ModelType.CLASSIFIER: self.label_list = np.unique(self.y) @@ -1355,7 +1389,7 @@ def predict_with_latency(X_copy): else: self.y_probs = None - output_column_name = self.evaluator_config.get(_Y_PREDICTED_OUTPUT_COLUMN_NAME, "output") + output_column_name = self.predictions self.y_pred, self.other_output_columns = _extract_output_and_other_columns( model_predictions, output_column_name ) @@ -1401,9 +1435,85 @@ def _compute_builtin_metrics(self): ) ) + def _check_args(self, metrics, eval_df): + failed_metrics = [] + # collect all failures for getting metric arguments + for metric in metrics: + result = self._get_args_for_metrics(metric, eval_df) + if isinstance(result, tuple): + failed_metrics.append(result) + + if len(failed_metrics) > 0: + output_columns = ( + [] if self.other_output_columns is None else list(self.other_output_columns.columns) + ) + input_columns = list(self.X.copy_to_avoid_mutation().columns) + + error_messages = [ + f"Metric '{metric_name}' requires the columns {param_names}" + for metric_name, param_names in failed_metrics + ] + joined_error_message = "\n".join(error_messages) + full_message = f"""Error: Metric calculation failed for the following metrics: + {joined_error_message} + + Below are the existing column names for the input/output data: + Input Columns: {input_columns} + Output Columns: {output_columns} + To resolve this issue, you may want to map the missing column to an existing column + using the following configuration: + evaluator_config={{'col_mapping': {{: }}}}""" + stripped_message = "\n".join(l.lstrip() for l in full_message.splitlines()) + raise MlflowException(stripped_message) + + def _test_first_row(self, eval_df): + # test calculations on first row of eval_df + exceptions = [] + first_row_df = eval_df.iloc[[0]] + for metric in self.builtin_metrics: + try: + eval_fn_args = self._get_args_for_metrics(metric, first_row_df) + metric_value = metric.eval_fn(*eval_fn_args) + + # need to update metrics because they might be used in calculating extra_metrics + if metric_value: + name = f"{metric.name}/{metric.version}" if metric.version else metric.name + self.metrics_values.update({name: metric_value}) + except Exception as e: + stacktrace_str = traceback.format_exc() + if isinstance(e, MlflowException): + exceptions.append( + f"Metric '{metric.name}': Error:\n{e.message}\n{stacktrace_str}" + ) + else: + exceptions.append(f"Metric '{metric.name}': Error:\n{e!r}\n{stacktrace_str}") + self._update_metrics() + for metric in self.extra_metrics: + try: + eval_fn_args = self._get_args_for_metrics(metric, first_row_df) + metric.eval_fn(*eval_fn_args) + except Exception as e: + stacktrace_str = traceback.format_exc() + if isinstance(e, MlflowException): + exceptions.append( + f"Metric '{metric.name}': Error:\n{e.message}\n{stacktrace_str}" + ) + else: + exceptions.append(f"Metric '{metric.name}': Error:\n{e!r}\n{stacktrace_str}") + + if len(exceptions) > 0: + raise MlflowException("\n".join(exceptions)) + + def _evaluate_metrics(self, eval_df): + self._check_args(self.builtin_metrics + self.extra_metrics, eval_df) + self._test_first_row(eval_df) + + # calculate metrics for the full eval_df + self._evaluate_builtin_metrics(eval_df) + self._update_metrics() + self._evaluate_extra_metrics(eval_df) + def _evaluate_builtin_metrics(self, eval_df): - if not self.builtin_metrics: - return for 
builtin_metric in self.builtin_metrics: _logger.info(f"Evaluating builtin metrics: {builtin_metric.name}") @@ -1442,12 +1552,33 @@ def _log_eval_table(self): metric_prefix = self.evaluator_config.get("metric_prefix", "") if not isinstance(metric_prefix, str): metric_prefix = "" - if self.dataset.has_targets: - data = self.dataset.features_data.assign( - **{self.dataset.targets_name or "target": self.y, "outputs": self.y_pred} - ) + if isinstance(self.dataset.features_data, pd.DataFrame): + # Handle DataFrame case + if self.dataset.has_targets: + data = self.dataset.features_data.assign( + **{ + self.dataset.targets_name or "target": self.y, + self.dataset.predictions_name or "outputs": self.y_pred, + } + ) + else: + data = self.dataset.features_data.assign(outputs=self.y_pred) else: - data = self.dataset.features_data.assign(outputs=self.y_pred) + # Handle NumPy array case, converting it to a DataFrame + data = pd.DataFrame(self.dataset.features_data, columns=self.dataset.feature_names) + if self.dataset.has_targets: + data = data.assign( + **{ + self.dataset.targets_name or "target": self.y, + self.dataset.predictions_name or "outputs": self.y_pred, + } + ) + else: + data = data.assign(outputs=self.y_pred) + + # Include other_output_columns in the eval table + if self.other_output_columns is not None: + data = data.assign(**self.other_output_columns) columns = {} for metric_name, metric_value in self.metrics_values.items(): @@ -1477,10 +1608,11 @@ def _update_metrics(self): for metric_name, metric_value in self.metrics_values.items(): if metric_value.aggregate_results: for agg_name, agg_value in metric_value.aggregate_results.items(): - if agg_name == metric_name.split("/")[0]: - self.metrics[metric_name] = agg_value - else: - self.metrics[f"{metric_name}/{agg_name}"] = agg_value + if agg_value is not None: + if agg_name == metric_name.split("/")[0]: + self.metrics[metric_name] = agg_value + else: + self.metrics[f"{metric_name}/{agg_name}"] = agg_value def _evaluate( self, @@ -1506,36 +1638,50 @@ def _evaluate( self.artifacts = {} self.metrics = {} self.metrics_values = {} - self.builtin_metrics = {} + self.builtin_metrics = [] text_metrics = [ - token_count, - toxicity, - perplexity, - flesch_kincaid_grade_level, - ari_grade_level, + token_count(), + toxicity(), + flesch_kincaid_grade_level(), + ari_grade_level(), ] with mlflow.utils.autologging_utils.disable_autologging(): compute_latency = False - if self.extra_metrics: - for extra_metric in self.extra_metrics: - # If latency metric is specified, we will compute latency for the model - # during prediction, and we will remove the metric from the list of extra - # metrics to be computed after prediction. - if extra_metric.name == _LATENCY_METRIC_NAME: - compute_latency = True - self.extra_metrics.remove(extra_metric) - break + for extra_metric in self.extra_metrics: + # If latency metric is specified, we will compute latency for the model + # during prediction, and we will remove the metric from the list of extra + # metrics to be computed after prediction. 
+ if extra_metric.name == _LATENCY_METRIC_NAME: + compute_latency = True + self.extra_metrics.remove(extra_metric) + break self._generate_model_predictions(compute_latency=compute_latency) if self.model_type in (_ModelType.CLASSIFIER, _ModelType.REGRESSOR): self._compute_builtin_metrics() elif self.model_type == _ModelType.QUESTION_ANSWERING: - self.builtin_metrics = [*text_metrics, exact_match] + self.builtin_metrics = [*text_metrics, exact_match()] elif self.model_type == _ModelType.TEXT_SUMMARIZATION: - self.builtin_metrics = [*text_metrics, rouge1, rouge2, rougeL, rougeLsum] + self.builtin_metrics = [ + *text_metrics, + rouge1(), + rouge2(), + rougeL(), + rougeLsum(), + ] elif self.model_type == _ModelType.TEXT: self.builtin_metrics = text_metrics + elif self.model_type == _ModelType.RETRIEVER: + k = self.evaluator_config.pop("k", 3) # default k to 3 if not specified + if not (isinstance(k, int) and k > 0): + _logger.warning( + "Cannot calculate 'precision_at_k' for invalid parameter 'k'." + f"'k' should be a positive integer; found: {k}" + "Skipping metric logging." + ) + else: + self.builtin_metrics = [precision_at_k(k)] self.y_pred = ( self.y_pred.squeeze() if isinstance(self.y_pred, pd.DataFrame) else self.y_pred @@ -1544,9 +1690,7 @@ def _evaluate( if self.dataset.has_targets: eval_df["target"] = self.y - self._evaluate_builtin_metrics(eval_df) - self._update_metrics() - self._evaluate_extra_metrics(eval_df) + self._evaluate_metrics(eval_df) if not is_baseline_model: self._log_custom_artifacts(eval_df) @@ -1585,6 +1729,7 @@ def evaluate( extra_metrics=None, custom_artifacts=None, baseline_model=None, + predictions=None, **kwargs, ): self.dataset = dataset @@ -1595,6 +1740,7 @@ def evaluate( self.custom_artifacts = custom_artifacts self.y = dataset.labels_data + self.predictions = predictions self.col_mapping = self.evaluator_config.get("col_mapping", {}) self.pos_label = self.evaluator_config.get("pos_label") self.sample_weights = self.evaluator_config.get("sample_weights") @@ -1615,6 +1761,24 @@ def evaluate( else: self.extra_metrics = extra_metrics + if self.extra_metrics is None: + self.extra_metrics = [] + + bad_metrics = [] + for metric in self.extra_metrics: + if not isinstance(metric, EvaluationMetric): + bad_metrics.append(metric) + if len(bad_metrics) > 0: + message = "\n".join( + [f"- Metric '{m}' has type '{type(m).__name__}'" for m in bad_metrics] + ) + raise MlflowException( + f"In the 'extra_metrics' parameter, the following metrics have the wrong type:\n" + f"{message}\n" + f"Please ensure that all extra metrics are instances of " + f"mlflow.metrics.EvaluationMetric." 
+ ) + if self.model_type in (_ModelType.CLASSIFIER, _ModelType.REGRESSOR): inferred_model_type = _infer_model_type_by_labels(self.y) if inferred_model_type is not None and model_type != inferred_model_type: diff --git a/mlflow/openai/__init__.py b/mlflow/openai/__init__.py index 666679e512787..f8c1621e8800c 100644 --- a/mlflow/openai/__init__.py +++ b/mlflow/openai/__init__.py @@ -47,7 +47,7 @@ from mlflow.models import Model, ModelInputExample, ModelSignature from mlflow.models.model import MLMODEL_FILE_NAME from mlflow.models.utils import _save_example -from mlflow.openai.utils import _OAITokenHolder +from mlflow.openai.utils import _OAITokenHolder, _validate_model_params from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE from mlflow.tracking._model_registry import DEFAULT_AWAIT_MAX_SLEEP_SECONDS from mlflow.tracking.artifact_utils import _download_artifact_from_uri @@ -358,6 +358,10 @@ def save_model( mlflow_model = Model() if signature is not None: + if signature.params: + _validate_model_params( + task, kwargs, {p.name: p.default for p in signature.params.params} + ) mlflow_model.signature = signature elif task == "chat.completions": messages = kwargs.get("messages", []) @@ -683,13 +687,14 @@ def get_params_list(self, data): else: return data[self.formater.variables].to_dict(orient="records") - def _predict_chat(self, data): + def _predict_chat(self, data, params): import openai from mlflow.openai.api_request_parallel_processor import process_api_requests + _validate_model_params(self.task, self.model, params) messages_list = self.format_completions(self.get_params_list(data)) - requests = [{**self.model, "messages": messages} for messages in messages_list] + requests = [{**self.model, **params, "messages": messages} for messages in messages_list] results = process_api_requests( requests, openai.ChatCompletion, @@ -699,18 +704,20 @@ def _predict_chat(self, data): ) return [r["choices"][0]["message"]["content"] for r in results] - def _predict_completions(self, data): + def _predict_completions(self, data, params): import openai from mlflow.openai.api_request_parallel_processor import process_api_requests + _validate_model_params(self.task, self.model, params) prompts_list = self.format_completions(self.get_params_list(data)) - batch_size = self.api_config.batch_size + batch_size = params.pop("batch_size", self.api_config.batch_size) _logger.debug(f"Requests are being batched by {batch_size} samples.") requests = [ { **self.model, + **params, "prompt": prompts_list[i : i + batch_size], } for i in range(0, len(prompts_list), batch_size) @@ -724,12 +731,13 @@ def _predict_completions(self, data): ) return [row["text"] for batch in results for row in batch["choices"]] - def _predict_embeddings(self, data): + def _predict_embeddings(self, data, params): import openai from mlflow.openai.api_request_parallel_processor import process_api_requests - batch_size = self.api_config.batch_size + _validate_model_params(self.task, self.model, params) + batch_size = params.pop("batch_size", self.api_config.batch_size) _logger.debug(f"Requests are being batched by {batch_size} samples.") first_string_column = _first_string_column(data) @@ -737,6 +745,7 @@ def _predict_embeddings(self, data): requests = [ { **self.model, + **params, "input": texts[i : i + batch_size], } for i in range(0, len(texts), batch_size) @@ -750,9 +759,7 @@ def _predict_embeddings(self, data): ) return [row["embedding"] for batch in results for row in batch["data"]] - def predict( - self, data, params: 
Optional[Dict[str, Any]] = None # pylint: disable=unused-argument - ): + def predict(self, data, params: Optional[Dict[str, Any]] = None): """ :param data: Model input data. :param params: Additional parameters to pass to the model for inference. @@ -765,11 +772,11 @@ def predict( self.api_token.validate() if self.task == "chat.completions": - return self._predict_chat(data) + return self._predict_chat(data, params or {}) elif self.task == "completions": - return self._predict_completions(data) + return self._predict_completions(data, params or {}) elif self.task == "embeddings": - return self._predict_embeddings(data) + return self._predict_embeddings(data, params or {}) class _TestOpenAIWrapper(_OpenAIWrapper): diff --git a/mlflow/openai/utils.py b/mlflow/openai/utils.py index 6fdbd2ca3f80b..fdaef6d444085 100644 --- a/mlflow/openai/utils.py +++ b/mlflow/openai/utils.py @@ -135,6 +135,22 @@ def request(*args, **kwargs): return _mock_request(new=request) +def _validate_model_params(task, model, params): + if not params: + return + + if any(key in model for key in params): + raise mlflow.MlflowException.invalid_parameter_value( + f"Providing any of {list(model.keys())} as parameters in the signature is not " + "allowed because they were indicated as part of the OpenAI model. Either remove " + "the argument when logging the model or remove the parameter from the signature.", + ) + if "batch_size" in params and task == "chat.completions": + raise mlflow.MlflowException.invalid_parameter_value( + "Parameter `batch_size` is not supported for task `chat.completions`" + ) + + class _OAITokenHolder: def __init__(self, api_type): import openai @@ -142,7 +158,13 @@ def __init__(self, api_type): self._api_token = None self._credential = None self._is_azure_ad = api_type in ("azure_ad", "azuread") - self._key_configured = bool(openai.api_key) or "OPENAI_API_KEY" in os.environ + self._key_configured = bool(openai.api_key) + + # set the api key if it's not set. 
this is to deal with cases where the + # user sets the environment variable after importing the `openai` module + if not bool(openai.api_key) and "OPENAI_API_KEY" in os.environ: + openai.api_key = os.environ["OPENAI_API_KEY"] + self._key_configured = True if self._is_azure_ad and not self._key_configured: try: diff --git a/mlflow/pyfunc/__init__.py b/mlflow/pyfunc/__init__.py index 1c8ffd3102f6a..bf77a350f5a91 100644 --- a/mlflow/pyfunc/__init__.py +++ b/mlflow/pyfunc/__init__.py @@ -209,7 +209,6 @@ import collections import functools -import hashlib import importlib import inspect import logging @@ -269,6 +268,7 @@ check_port_connectivity, find_free_port, get_major_minor_py_version, + insecure_hash, ) from mlflow.utils import env_manager as _EnvManager from mlflow.utils.annotations import deprecated, experimental @@ -1473,6 +1473,8 @@ def _predict_row_batch(predict_fn, args): pandas.DataFrame if isinstance(result_type, SparkStructType) else pandas.Series ) + tracking_uri = mlflow.get_tracking_uri() + @pandas_udf(result_type) def udf( iterator: Iterator[Tuple[Union[pandas.Series, pandas.DataFrame], ...]] @@ -1492,6 +1494,9 @@ def udf( if mlflow_testing: _MLFLOW_TESTING.set(mlflow_testing) scoring_server_proc = None + # set tracking_uri inside udf so that with spark_connect + # we can load the model from correct path + mlflow.set_tracking_uri(tracking_uri) if env_manager != _EnvManager.LOCAL: if should_use_spark_to_broadcast_file: @@ -1578,7 +1583,7 @@ def batch_predict_fn(pdf, params=None): model_path = os.path.join( tempfile.gettempdir(), "mlflow", - hashlib.sha1(model_uri.encode()).hexdigest(), + insecure_hash.sha1(model_uri.encode()).hexdigest(), ) try: loaded_model = mlflow.pyfunc.load_model(model_path) diff --git a/mlflow/pytorch/__init__.py b/mlflow/pytorch/__init__.py index bb24b7f680977..11df0adcce244 100644 --- a/mlflow/pytorch/__init__.py +++ b/mlflow/pytorch/__init__.py @@ -947,35 +947,28 @@ def autolog( The registered model is created if it does not already exist. :param extra_tags: A dictionary of extra tags to set on each managed run created by autologging. - .. code-block:: python + .. testcode:: python :caption: Example import os - import pytorch_lightning as pl + import lightning as L import torch from torch.nn import functional as F - from torch.utils.data import DataLoader + from torch.utils.data import DataLoader, Subset + from torchmetrics import Accuracy from torchvision import transforms from torchvision.datasets import MNIST - try: - from torchmetrics.functional import accuracy - except ImportError: - from pytorch_lightning.metrics.functional import accuracy - import mlflow.pytorch from mlflow import MlflowClient - # For brevity, here is the simplest most minimal example with just a training - # loop step, (no validation, no testing). It illustrates how you can use MLflow - # to auto log parameters, metrics, and models. - - class MNISTModel(pl.LightningModule): + class MNISTModel(L.LightningModule): def __init__(self): super().__init__() self.l1 = torch.nn.Linear(28 * 28, 10) + self.accuracy = Accuracy("multiclass", num_classes=10) def forward(self, x): return torch.relu(self.l1(x.view(x.size(0), -1))) @@ -985,9 +978,9 @@ def training_step(self, batch, batch_nb): logits = self(x) loss = F.cross_entropy(logits, y) pred = logits.argmax(dim=1) - acc = accuracy(pred, y) + acc = self.accuracy(pred, y) - # Use the current of PyTorch logger + # PyTorch `self.log` will be automatically captured by MLflow. 
self.log("train_loss", loss, on_epoch=True) self.log("acc", acc, on_epoch=True) return loss @@ -1006,51 +999,30 @@ def print_auto_logged_info(r): print(f"tags: {tags}") - # Initialize our model + # Initialize our model. mnist_model = MNISTModel() - # Initialize DataLoader from MNIST Dataset + # Load MNIST dataset. train_ds = MNIST( os.getcwd(), train=True, download=True, transform=transforms.ToTensor() ) - train_loader = DataLoader(train_ds, batch_size=32) + # Only take a subset of the data for faster training. + indices = torch.arange(32) + train_ds = Subset(train_ds, indices) + train_loader = DataLoader(train_ds, batch_size=8) - # Initialize a trainer - trainer = pl.Trainer(max_epochs=20, progress_bar_refresh_rate=20) + # Initialize a trainer. + trainer = L.Trainer(max_epochs=3) # Auto log all MLflow entities mlflow.pytorch.autolog() - # Train the model + # Train the model. with mlflow.start_run() as run: trainer.fit(mnist_model, train_loader) - # fetch the auto logged parameters and metrics + # Fetch the auto logged parameters and metrics. print_auto_logged_info(mlflow.get_run(run_id=run.info.run_id)) - - .. code-block:: text - :caption: Output - - run_id: 42caa17b60cb489c8083900fb52506a7 - artifacts: ['model/MLmodel', 'model/conda.yaml', 'model/data'] - params: {'betas': '(0.9, 0.999)', - 'weight_decay': '0', - 'epochs': '20', - 'eps': '1e-08', - 'lr': '0.02', - 'optimizer_name': 'Adam', ' - amsgrad': 'False'} - metrics: {'acc_step': 0.0, - 'train_loss_epoch': 1.0917967557907104, - 'train_loss_step': 1.0794280767440796, - 'train_loss': 1.0794280767440796, - 'acc_epoch': 0.0033333334140479565, - 'acc': 0.0} - tags: {'Mode': 'training'} - - .. figure:: ../_static/images/pytorch_lightening_autolog.png - - PyTorch autologged MLflow entities """ try: import pytorch_lightning as pl diff --git a/mlflow/sagemaker/__init__.py b/mlflow/sagemaker/__init__.py index 6a9a8be15b0b5..b647dc2405c1d 100644 --- a/mlflow/sagemaker/__init__.py +++ b/mlflow/sagemaker/__init__.py @@ -2396,8 +2396,8 @@ def update_deployment( Defaults to ``None``. - ``variant_name``: A string specifying the desired name when creating a - production variant. Defaults to ``None``. - - ``async_inference_config``: A dictionary specifying the async config + production variant. Defaults to ``None``. + - ``async_inference_config``: A dictionary specifying the async config configuration. Defaults to ``None``. - ``env``: A dictionary specifying environment variables as key-value pairs to be set for the deployed model. Defaults to ``None``. 
diff --git a/mlflow/server/__init__.py b/mlflow/server/__init__.py index 663f98de557e8..43a3ceee20fd6 100644 --- a/mlflow/server/__init__.py +++ b/mlflow/server/__init__.py @@ -7,7 +7,6 @@ import types from flask import Flask, Response, send_from_directory -from flask import __version__ as flask_version from packaging.version import Version from mlflow.exceptions import MlflowException @@ -40,7 +39,7 @@ REL_STATIC_DIR = "js/build" app = Flask(__name__, static_folder=REL_STATIC_DIR) -IS_FLASK_V1 = Version(flask_version) < Version("2.0") +IS_FLASK_V1 = Version(importlib.metadata.version("flask")) < Version("2.0") for http_path, handler, methods in handlers.get_endpoints(): diff --git a/mlflow/spark/__init__.py b/mlflow/spark/__init__.py index 199a6b13f1c4f..ddcb51608feb5 100644 --- a/mlflow/spark/__init__.py +++ b/mlflow/spark/__init__.py @@ -94,10 +94,8 @@ def get_default_pip_requirements(is_spark_connect_model=False): # Strip the suffix from `dev` versions of PySpark, which are not # available for installation from Anaconda or PyPI - pyspark_extras = ["connect"] if is_spark_connect_model else None - pyspark_req = re.sub( - r"(\.?)dev.*$", "", _get_pinned_requirement("pyspark", extras=pyspark_extras) - ) + pyspark_req_str = "pyspark[connect]" if is_spark_connect_model else "pyspark" + pyspark_req = re.sub(r"(\.?)dev.*$", "", _get_pinned_requirement(pyspark_req_str)) reqs = [pyspark_req] if Version(pyspark.__version__) < Version("3.4"): # Versions of PySpark < 3.4 are incompatible with pandas >= 2 diff --git a/mlflow/store/_unity_catalog/registry/rest_store.py b/mlflow/store/_unity_catalog/registry/rest_store.py index aaa20074ba6c3..4bb4f16f8f12a 100644 --- a/mlflow/store/_unity_catalog/registry/rest_store.py +++ b/mlflow/store/_unity_catalog/registry/rest_store.py @@ -745,6 +745,19 @@ def get_model_version_by_alias(self, name, alias): response_proto = self._call_endpoint(GetModelVersionByAliasRequest, req_body) return model_version_from_uc_proto(response_proto.model_version) + def copy_model_version(self, src_mv, dst_name): + """ + Copy a model version from one registered model to another as a new model version. + + :param src_mv: A :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the source model version. + :param dst_name: the name of the registered model to copy the model version to. If a + registered model with this name does not exist, it will be created. + :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the cloned model version. 
+ """ + return self._copy_model_version_impl(src_mv, dst_name) + def _await_model_version_creation(self, mv, await_creation_for): """ Does not wait for the model version to become READY as a successful creation will diff --git a/mlflow/store/artifact/cloud_artifact_repo.py b/mlflow/store/artifact/cloud_artifact_repo.py index dc04759563ede..0ddfdf57e18bf 100644 --- a/mlflow/store/artifact/cloud_artifact_repo.py +++ b/mlflow/store/artifact/cloud_artifact_repo.py @@ -210,21 +210,22 @@ def _parallelized_download_from_cloud(self, file_size, remote_file_path, local_p env=parallel_download_subproc_env, headers=self._extract_headers_from_credentials(cloud_credential_info.headers), ) - if any(not e.retryable for e in failed_downloads.values()): - template = "===== Chunk {index} =====\n{error}" - failure = "\n".join( - template.format(index=index, error=error) - for index, error in failed_downloads.items() - ) - raise MlflowException(f"Failed to download artifact {remote_file_path}:\n{failure}") if failed_downloads: new_cloud_creds = self._get_read_credential_infos([remote_file_path])[0] new_signed_uri = new_cloud_creds.signed_uri new_headers = self._extract_headers_from_credentials(new_cloud_creds.headers) - - for i in failed_downloads: - download_chunk(i, _DOWNLOAD_CHUNK_SIZE, new_headers, local_path, new_signed_uri) + for chunk in failed_downloads: + _logger.warning( + f"Retrying download of chunk {chunk.index} of {remote_file_path}" + ) + download_chunk( + range_start=chunk.start, + range_end=chunk.end, + headers=new_headers, + download_path=local_path, + http_uri=new_signed_uri, + ) def _download_file(self, remote_file_path, local_path): # list_artifacts API only returns a list of FileInfos at the specified path diff --git a/mlflow/store/artifact/databricks_models_artifact_repo.py b/mlflow/store/artifact/databricks_models_artifact_repo.py index 8db9a5f892f94..7349fa9e999d0 100644 --- a/mlflow/store/artifact/databricks_models_artifact_repo.py +++ b/mlflow/store/artifact/databricks_models_artifact_repo.py @@ -157,23 +157,23 @@ def _parallelized_download_from_cloud( env=parallel_download_subproc_env, headers=headers, ) - if any(not e.retryable for e in failed_downloads.values()): - template = "===== Chunk {index} =====\n{error}" - failure = "\n".join( - template.format(index=index, error=error) - for index, error in failed_downloads.items() - ) - raise MlflowException( - f"Failed to download artifact {dst_run_relative_artifact_path}:\n{failure}" - ) if failed_downloads: new_signed_uri, new_headers = self._get_signed_download_uri( dst_run_relative_artifact_path ) - for i in failed_downloads: - download_chunk( - i, _DOWNLOAD_CHUNK_SIZE, new_headers, dst_local_file_path, new_signed_uri - ) + new_headers = self._extract_headers_from_signed_url(new_headers) + for chunk in failed_downloads: + _logger.warning( + f"Retrying download of chunk {chunk.index} of " + f"{dst_run_relative_artifact_path}" + ) + download_chunk( + range_start=chunk.start, + range_end=chunk.end, + headers=new_headers, + download_path=dst_local_file_path, + http_uri=new_signed_uri, + ) def _download_file(self, remote_file_path, local_path): try: diff --git a/mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py b/mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py new file mode 100644 index 0000000000000..03a40f3259b12 --- /dev/null +++ b/mlflow/store/db_migrations/versions/acf3f17fdcc7_add_storage_location_field_to_model_.py @@ -0,0 +1,28 @@ +"""add storage 
location field to model versions + +Revision ID: acf3f17fdcc7 +Revises: 2d6e25af4d3e +Create Date: 2023-10-23 15:26:53.062080 + +""" +from alembic import op +import sqlalchemy as sa +from mlflow.store.model_registry.dbmodels.models import SqlModelVersion + + +# revision identifiers, used by Alembic. +revision = "acf3f17fdcc7" +down_revision = "2d6e25af4d3e" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + SqlModelVersion.__tablename__, + sa.Column("storage_location", sa.String(500), nullable=True, default=None), + ) + + +def downgrade(): + pass diff --git a/mlflow/store/model_registry/abstract_store.py b/mlflow/store/model_registry/abstract_store.py index 56870304ce3ca..e7fdcab21cb6c 100644 --- a/mlflow/store/model_registry/abstract_store.py +++ b/mlflow/store/model_registry/abstract_store.py @@ -2,8 +2,10 @@ from abc import ABCMeta, abstractmethod from time import sleep, time +from mlflow.entities.model_registry import ModelVersionTag from mlflow.entities.model_registry.model_version_status import ModelVersionStatus from mlflow.exceptions import MlflowException +from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS, ErrorCode from mlflow.utils.annotations import developer_stable _logger = logging.getLogger(__name__) @@ -321,6 +323,39 @@ def get_model_version_by_alias(self, name, alias): """ pass + def copy_model_version(self, src_mv, dst_name): + """ + Copy a model version from one registered model to another as a new model version. + + :param src_mv: A :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the source model version. + :param dst_name: the name of the registered model to copy the model version to. If a + registered model with this name does not exist, it will be created. + :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the cloned model version. + """ + raise MlflowException( + "Method 'copy_model_version' has not yet been implemented for the current model " + "registry backend. To request support for implementing this method with this backend, " + "please submit an issue on GitHub." + ) + + def _copy_model_version_impl(self, src_mv, dst_name): + try: + self.create_registered_model(dst_name) + except MlflowException as e: + if e.error_code != ErrorCode.Name(RESOURCE_ALREADY_EXISTS): + raise + + return self.create_model_version( + name=dst_name, + source=f"models:/{src_mv.name}/{src_mv.version}", + run_id=src_mv.run_id, + tags=[ModelVersionTag(k, v) for k, v in src_mv.tags.items()], + run_link=src_mv.run_link, + description=src_mv.description, + ) + def _await_model_version_creation(self, mv, await_creation_for): """ Await for model version to become ready after creation. 
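For context (not part of the patch): the default `_copy_model_version_impl` added above boils down to the following sequence of public client calls. This is a hedged sketch with placeholder model names; it shows that the "copy" is simply a new model version whose `models:/` source the store changes in this diff resolve to the source version's concrete storage location.

.. code-block:: python

    from mlflow import MlflowClient
    from mlflow.exceptions import MlflowException
    from mlflow.protos.databricks_pb2 import RESOURCE_ALREADY_EXISTS, ErrorCode

    client = MlflowClient()
    src = client.get_model_version("my-model", "1")  # placeholder name/version

    # Create the destination registered model if it does not exist yet,
    # mirroring _copy_model_version_impl above.
    try:
        client.create_registered_model("my-model-prod")
    except MlflowException as e:
        if e.error_code != ErrorCode.Name(RESOURCE_ALREADY_EXISTS):
            raise

    # The copied version points at a models:/ URI as its source; the backend
    # resolves that URI to the original artifact location.
    copied = client.create_model_version(
        name="my-model-prod",
        source=f"models:/{src.name}/{src.version}",
        run_id=src.run_id,
        description=src.description,
    )
    print(copied.version)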
diff --git a/mlflow/store/model_registry/dbmodels/models.py b/mlflow/store/model_registry/dbmodels/models.py index fbe1f5a57bc65..30039eceec30f 100644 --- a/mlflow/store/model_registry/dbmodels/models.py +++ b/mlflow/store/model_registry/dbmodels/models.py @@ -80,6 +80,8 @@ class SqlModelVersion(Base): source = Column(String(500), nullable=True, default=None) + storage_location = Column(String(500), nullable=True, default=None) + run_id = Column(String(32), nullable=True, default=None) run_link = Column(String(500), nullable=True, default=None) diff --git a/mlflow/store/model_registry/file_store.py b/mlflow/store/model_registry/file_store.py index 0dad30cf3ebbc..513a8a2463048 100644 --- a/mlflow/store/model_registry/file_store.py +++ b/mlflow/store/model_registry/file_store.py @@ -3,7 +3,9 @@ import shutil import sys import time +import urllib from os.path import join +from typing import List from mlflow.entities.model_registry import ( ModelVersion, @@ -27,6 +29,7 @@ RESOURCE_ALREADY_EXISTS, RESOURCE_DOES_NOT_EXIST, ) +from mlflow.store.artifact.utils.models import _parse_model_uri from mlflow.store.entities.paged_list import PagedList from mlflow.store.model_registry import ( DEFAULT_LOCAL_FILE_AND_ARTIFACT_PATH, @@ -78,6 +81,32 @@ def _validate_model_name(name): ) +class FileModelVersion(ModelVersion): + def __init__(self, storage_location=None, **kwargs): + super().__init__(**kwargs) + self._storage_location = storage_location + + @property + def storage_location(self): + """String. The storage location of the model version.""" + return self._storage_location + + @storage_location.setter + def storage_location(self, location): + self._storage_location = location + + @classmethod + def _properties(cls): + # aggregate with parent class with subclass properties + return sorted(ModelVersion._properties() + cls._get_properties_helper()) + + def to_mlflow_entity(self): + meta = dict(self) + return ModelVersion.from_dictionary( + {**meta, "tags": [ModelVersionTag(k, v) for k, v in meta["tags"].items()]} + ) + + class FileStore(AbstractStore): MODELS_FOLDER_NAME = "models" META_DATA_FILE_NAME = "meta.yaml" @@ -232,7 +261,7 @@ def rename_registered_model(self, name, new_name): self._save_registered_model_as_meta_file( registered_model, meta_dir=new_meta_dir, overwrite=False ) - model_versions = self._list_model_versions_under_path(model_path) + model_versions = self._list_file_model_versions_under_path(model_path) for mv in model_versions: mv.name = new_name mv.last_updated_timestamp = updated_time @@ -347,7 +376,7 @@ def get_registered_model(self, name): ) return self._get_registered_model_from_path(model_path) - def get_latest_versions(self, name, stages=None): + def get_latest_versions(self, name, stages=None) -> List[ModelVersion]: """ Latest version models for each requested stage. If no ``stages`` argument is provided, returns the latest version for each stage. 
@@ -363,7 +392,7 @@ def get_latest_versions(self, name, stages=None): f"Registered Model with name={name} not found", RESOURCE_DOES_NOT_EXIST, ) - model_versions = self._list_model_versions_under_path(registered_model_path) + model_versions = self._list_file_model_versions_under_path(registered_model_path) if stages is None or len(stages) == 0: expected_stages = {get_canonical_stage(stage) for stage in ALL_STAGES} else: @@ -375,7 +404,7 @@ def get_latest_versions(self, name, stages=None): mv.current_stage not in latest_versions or latest_versions[mv.current_stage].version < mv.version ): - latest_versions[mv.current_stage] = mv + latest_versions[mv.current_stage] = mv.to_mlflow_entity() return [latest_versions[stage] for stage in expected_stages if stage in latest_versions] @@ -474,12 +503,12 @@ def delete_registered_model_tag(self, name, key): # CRUD API for ModelVersion objects - def _get_registered_model_version_tag_from_file(self, parent_path, tag_name): + def _get_registered_model_version_tag_from_file(self, parent_path, tag_name) -> ModelVersionTag: _validate_tag_name(tag_name) tag_data = read_file(parent_path, tag_name) return ModelVersionTag(tag_name, tag_data) - def _get_model_version_tags_from_dir(self, directory): + def _get_model_version_tags_from_dir(self, directory) -> List[ModelVersionTag]: parent_path, tag_files = self._get_resource_files(directory, FileStore.TAGS_FOLDER_NAME) tags = [] for tag_file in tag_files: @@ -500,13 +529,15 @@ def _get_model_version_aliases(self, directory): version = os.path.basename(directory).replace("version-", "") return [alias.alias for alias in aliases if alias.version == version] - def _get_model_version_from_dir(self, directory): + def _get_file_model_version_from_dir(self, directory) -> FileModelVersion: meta = FileStore._read_yaml(directory, FileStore.META_DATA_FILE_NAME) meta["tags"] = self._get_model_version_tags_from_dir(directory) meta["aliases"] = self._get_model_version_aliases(directory) - return ModelVersion.from_dictionary(meta) + return FileModelVersion.from_dictionary(meta) - def _save_model_version_as_meta_file(self, model_version, meta_dir=None, overwrite=True): + def _save_model_version_as_meta_file( + self, model_version: FileModelVersion, meta_dir=None, overwrite=True + ): model_version_dict = dict(model_version) del model_version_dict["tags"] meta_dir = meta_dir or self._get_model_version_dir( @@ -534,12 +565,13 @@ def create_model_version( run_link=None, description=None, local_model_path=None, - ): + ) -> ModelVersion: """ Create a new model version from given source and run ID. :param name: Registered model name. - :param source: Source path where the MLflow model is stored. + :param source: Source path or model version URI (in the format + ``models://``) where the MLflow model is stored. :param run_id: Run ID from MLflow tracking server that generated the model. :param tags: A list of :py:class:`mlflow.entities.model_registry.ModelVersionTag` instances associated with this model version. 
@@ -551,7 +583,7 @@ def create_model_version( def next_version(registered_model_name): path = self._get_registered_model_path(registered_model_name) - model_versions = self._list_model_versions_under_path(path) + model_versions = self._list_file_model_versions_under_path(path) if model_versions: return max(mv.version for mv in model_versions) + 1 else: @@ -560,6 +592,18 @@ def next_version(registered_model_name): _validate_model_name(name) for tag in tags or []: _validate_model_version_tag(tag.key, tag.value) + storage_location = source + if urllib.parse.urlparse(source).scheme == "models": + parsed_model_uri = _parse_model_uri(source) + try: + storage_location = self.get_model_version_download_uri( + parsed_model_uri.name, parsed_model_uri.version + ) + except Exception as e: + raise MlflowException( + f"Unable to fetch model from model URI source artifact location '{source}'." + f"Error: {e}" + ) from e for attempt in range(self.CREATE_MODEL_VERSION_RETRIES): try: creation_time = get_current_time_millis() @@ -567,7 +611,7 @@ def next_version(registered_model_name): registered_model.last_updated_timestamp = creation_time self._save_registered_model_as_meta_file(registered_model) version = next_version(name) - model_version = ModelVersion( + model_version = FileModelVersion( name=name, version=version, creation_timestamp=creation_time, @@ -579,6 +623,7 @@ def next_version(registered_model_name): run_link=run_link, tags=tags, aliases=[], + storage_location=storage_location, ) model_version_dir = self._get_model_version_dir(name, version) mkdir(model_version_dir) @@ -589,7 +634,7 @@ def next_version(registered_model_name): if tags is not None: for tag in tags: self.set_model_version_tag(name, version, tag) - return model_version + return model_version.to_mlflow_entity() except Exception as e: more_retries = self.CREATE_MODEL_VERSION_RETRIES - attempt - 1 logging.warning( @@ -604,7 +649,7 @@ def next_version(registered_model_name): f"{self.CREATE_MODEL_VERSION_RETRIES} attempts." ) - def update_model_version(self, name, version, description): + def update_model_version(self, name, version, description) -> ModelVersion: """ Update metadata associated with a model version in backend. @@ -614,13 +659,15 @@ def update_model_version(self, name, version, description): :return: A single :py:class:`mlflow.entities.model_registry.ModelVersion` object. """ updated_time = get_current_time_millis() - model_version = self.get_model_version(name=name, version=version) + model_version = self._fetch_file_model_version_if_exists(name=name, version=version) model_version.description = description model_version.last_updated_timestamp = updated_time self._save_model_version_as_meta_file(model_version) - return model_version + return model_version.to_mlflow_entity() - def transition_model_version_stage(self, name, version, stage, archive_existing_versions): + def transition_model_version_stage( + self, name, version, stage, archive_existing_versions + ) -> ModelVersion: """ Update model version stage. 
@@ -645,19 +692,19 @@ def transition_model_version_stage(self, name, version, stage, archive_existing_ model_versions = [] if archive_existing_versions: registered_model_path = self._get_registered_model_path(name) - model_versions = self._list_model_versions_under_path(registered_model_path) + model_versions = self._list_file_model_versions_under_path(registered_model_path) for mv in model_versions: if mv.version != version and mv.current_stage == get_canonical_stage(stage): mv.current_stage = STAGE_ARCHIVED mv.last_updated_timestamp = last_updated_time self._save_model_version_as_meta_file(mv) - model_version = self.get_model_version(name, version) + model_version = self._fetch_file_model_version_if_exists(name, version) model_version.current_stage = get_canonical_stage(stage) model_version.last_updated_timestamp = last_updated_time self._save_model_version_as_meta_file(model_version) self._update_registered_model_last_updated_time(name, last_updated_time) - return model_version + return model_version.to_mlflow_entity() def delete_model_version(self, name, version): """ @@ -667,7 +714,7 @@ def delete_model_version(self, name, version): :param version: Registered model version. :return: None """ - model_version = self.get_model_version(name=name, version=version) + model_version = self._fetch_file_model_version_if_exists(name=name, version=version) model_version.current_stage = STAGE_DELETED_INTERNAL updated_time = get_current_time_millis() model_version.last_updated_timestamp = updated_time @@ -676,14 +723,16 @@ def delete_model_version(self, name, version): for alias in model_version.aliases: self.delete_registered_model_alias(name, alias) - def _fetch_model_version_if_exists(self, name, version): + def _fetch_file_model_version_if_exists(self, name, version) -> FileModelVersion: + _validate_model_name(name) + _validate_model_version(version) registered_model_version_dir = self._get_model_version_dir(name, version) if not exists(registered_model_version_dir): raise MlflowException( f"Model Version (name={name}, version={version}) not found", RESOURCE_DOES_NOT_EXIST, ) - model_version = self._get_model_version_from_dir(registered_model_version_dir) + model_version = self._get_file_model_version_from_dir(registered_model_version_dir) if model_version.current_stage == STAGE_DELETED_INTERNAL: raise MlflowException( f"Model Version (name={name}, version={version}) not found", @@ -691,7 +740,7 @@ def _fetch_model_version_if_exists(self, name, version): ) return model_version - def get_model_version(self, name, version): + def get_model_version(self, name, version) -> ModelVersion: """ Get the model version instance by name and version. @@ -699,11 +748,9 @@ def get_model_version(self, name, version): :param version: Registered model version. :return: A single :py:class:`mlflow.entities.model_registry.ModelVersion` object. """ - _validate_model_name(name) - _validate_model_version(version) - return self._fetch_model_version_if_exists(name, version) + return self._fetch_file_model_version_if_exists(name, version).to_mlflow_entity() - def get_model_version_download_uri(self, name, version): + def get_model_version_download_uri(self, name, version) -> str: """ Get the download location in Model Registry for this model version. NOTE: For first version of Model Registry, since the models are not copied over to another @@ -713,14 +760,14 @@ def get_model_version_download_uri(self, name, version): :param version: Registered model version. 
:return: A single URI location that allows reads for downloading. """ - model_version = self.get_model_version(name, version) - return model_version.source + model_version = self._fetch_file_model_version_if_exists(name, version) + return model_version.storage_location or model_version.source def _get_all_registered_model_paths(self): self._check_root_dir() return list_subdirs(join(self.root_directory, FileStore.MODELS_FOLDER_NAME), full_path=True) - def _list_model_versions_under_path(self, path): + def _list_file_model_versions_under_path(self, path) -> List[FileModelVersion]: model_versions = [] model_version_dirs = list_all( path, @@ -729,12 +776,12 @@ def _list_model_versions_under_path(self, path): full_path=True, ) for directory in model_version_dirs: - model_versions.append(self._get_model_version_from_dir(directory)) + model_versions.append(self._get_file_model_version_from_dir(directory)) return model_versions def search_model_versions( self, filter_string=None, max_results=None, order_by=None, page_token=None - ): + ) -> List[ModelVersion]: """ Search for model versions in backend that satisfy the filter criteria. @@ -767,7 +814,10 @@ def search_model_versions( registered_model_paths = self._get_all_registered_model_paths() model_versions = [] for path in registered_model_paths: - model_versions.extend(self._list_model_versions_under_path(path)) + model_versions.extend( + file_mv.to_mlflow_entity() + for file_mv in self._list_file_model_versions_under_path(path) + ) filtered_mvs = SearchModelVersionUtils.filter(model_versions, filter_string) sorted_mvs = SearchModelVersionUtils.sort( @@ -784,10 +834,8 @@ def search_model_versions( return PagedList(paginated_mvs, next_page_token) def _get_registered_model_version_tag_path(self, name, version, tag_name): - _validate_model_name(name) - _validate_model_version(version) _validate_tag_name(tag_name) - self._fetch_model_version_if_exists(name, version) + self._fetch_file_model_version_if_exists(name, version) registered_model_version_path = self._get_model_version_dir(name, version) return os.path.join(registered_model_version_path, FileStore.TAGS_FOLDER_NAME, tag_name) @@ -845,7 +893,7 @@ def set_registered_model_alias(self, name, alias, version): :return: None """ alias_path = self._get_registered_model_alias_path(name, alias) - self._fetch_model_version_if_exists(name, version) + self._fetch_file_model_version_if_exists(name, version) make_containing_dirs(alias_path) write_to(alias_path, self._writeable_value(version)) updated_time = get_current_time_millis() @@ -865,7 +913,7 @@ def delete_registered_model_alias(self, name, alias): updated_time = get_current_time_millis() self._update_registered_model_last_updated_time(name, updated_time) - def get_model_version_by_alias(self, name, alias): + def get_model_version_by_alias(self, name, alias) -> ModelVersion: """ Get the model version instance by name and alias. @@ -905,6 +953,19 @@ def _read_helper(root, file_name, attempts_remaining=2): return _read_helper(root, file_name, attempts_remaining=retries) + def copy_model_version(self, src_mv, dst_name) -> ModelVersion: + """ + Copy a model version from one registered model to another as a new model version. + + :param src_mv: A :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the source model version. + :param dst_name: the name of the registered model to copy the model version to. If a + registered model with this name does not exist, it will be created. 
+ :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the cloned model version. + """ + return self._copy_model_version_impl(src_mv, dst_name) + def _await_model_version_creation(self, mv, await_creation_for): """ Does not wait for the model version to become READY as a successful creation will diff --git a/mlflow/store/model_registry/sqlalchemy_store.py b/mlflow/store/model_registry/sqlalchemy_store.py index a4a65e552f7a0..cabef5c65e674 100644 --- a/mlflow/store/model_registry/sqlalchemy_store.py +++ b/mlflow/store/model_registry/sqlalchemy_store.py @@ -1,9 +1,11 @@ import logging +import urllib import sqlalchemy from sqlalchemy.future import select import mlflow.store.db.utils +from mlflow.entities.model_registry import ModelVersion from mlflow.entities.model_registry.model_version_stages import ( ALL_STAGES, DEFAULT_STAGES_FOR_GET_LATEST_VERSIONS, @@ -18,6 +20,7 @@ RESOURCE_ALREADY_EXISTS, RESOURCE_DOES_NOT_EXIST, ) +from mlflow.store.artifact.utils.models import _parse_model_uri from mlflow.store.entities.paged_list import PagedList from mlflow.store.model_registry import ( SEARCH_MODEL_VERSION_MAX_RESULTS_DEFAULT, @@ -612,7 +615,8 @@ def create_model_version( Create a new model version from given source and run ID. :param name: Registered model name. - :param source: Source path where the MLflow model is stored. + :param source: Source path or model version URI (in the format + ``models://``) where the MLflow model is stored. :param run_id: Run ID from MLflow tracking server that generated the model. :param tags: A list of :py:class:`mlflow.entities.model_registry.ModelVersionTag` instances associated with this model version. @@ -631,6 +635,18 @@ def next_version(sql_registered_model): _validate_model_name(name) for tag in tags or []: _validate_model_version_tag(tag.key, tag.value) + storage_location = source + if urllib.parse.urlparse(source).scheme == "models": + parsed_model_uri = _parse_model_uri(source) + try: + storage_location = self.get_model_version_download_uri( + parsed_model_uri.name, parsed_model_uri.version + ) + except Exception as e: + raise MlflowException( + f"Unable to fetch model from model URI source artifact location '{source}'." + f"Error: {e}" + ) from e with self.ManagedSessionMaker() as session: creation_time = get_current_time_millis() for attempt in range(self.CREATE_MODEL_VERSION_RETRIES): @@ -644,6 +660,7 @@ def next_version(sql_registered_model): creation_time=creation_time, last_updated_time=creation_time, source=source, + storage_location=storage_location, run_id=run_id, run_link=run_link, description=description, @@ -856,7 +873,7 @@ def get_model_version_download_uri(self, name, version): """ with self.ManagedSessionMaker() as session: sql_model_version = self._get_sql_model_version(session, name, version) - return sql_model_version.source + return sql_model_version.storage_location or sql_model_version.source def search_model_versions( self, @@ -1099,6 +1116,19 @@ def get_model_version_by_alias(self, name, alias): f"Registered model alias {alias} not found.", INVALID_PARAMETER_VALUE ) + def copy_model_version(self, src_mv, dst_name) -> ModelVersion: + """ + Copy a model version from one registered model to another as a new model version. + + :param src_mv: A :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the source model version. + :param dst_name: the name of the registered model to copy the model version to. 
If a + registered model with this name does not exist, it will be created. + :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the cloned model version. + """ + return self._copy_model_version_impl(src_mv, dst_name) + def _await_model_version_creation(self, mv, await_creation_for): """ Does not wait for the model version to become READY as a successful creation will diff --git a/mlflow/store/tracking/abstract_store.py b/mlflow/store/tracking/abstract_store.py index 69de91f8c7538..172aeeacb0cca 100644 --- a/mlflow/store/tracking/abstract_store.py +++ b/mlflow/store/tracking/abstract_store.py @@ -5,6 +5,8 @@ from mlflow.store.entities.paged_list import PagedList from mlflow.store.tracking import SEARCH_MAX_RESULTS_DEFAULT from mlflow.utils.annotations import developer_stable, experimental +from mlflow.utils.async_logging.async_logging_queue import AsyncLoggingQueue +from mlflow.utils.async_logging.run_operations import RunOperations @developer_stable @@ -21,7 +23,7 @@ def __init__(self): Empty constructor for now. This is deliberately not marked as abstract, else every derived class would be forced to create one. """ - pass + self._async_logging_queue = AsyncLoggingQueue(logging_func=self.log_batch) @abstractmethod def search_experiments( @@ -219,6 +221,15 @@ def log_metric(self, run_id, metric): """ self.log_batch(run_id, metrics=[metric], params=[], tags=[]) + def log_metric_async(self, run_id, metric) -> RunOperations: + """ + Log a metric for the specified run in async fashion. + + :param run_id: String id for the run + :param metric: :py:class:`mlflow.entities.Metric` instance to log + """ + return self.log_batch_async(run_id, metrics=[metric], params=[], tags=[]) + def log_param(self, run_id, param): """ Log a param for the specified run @@ -228,6 +239,15 @@ def log_param(self, run_id, param): """ self.log_batch(run_id, metrics=[], params=[param], tags=[]) + def log_param_async(self, run_id, param) -> RunOperations: + """ + Log a param for the specified run in async fashion. + + :param run_id: String id for the run + :param param: :py:class:`mlflow.entities.Param` instance to log + """ + return self.log_batch_async(run_id, metrics=[], params=[param], tags=[]) + def set_experiment_tag(self, experiment_id, tag): """ Set a tag for the specified experiment @@ -246,6 +266,15 @@ def set_tag(self, run_id, tag): """ self.log_batch(run_id, metrics=[], params=[], tags=[tag]) + def set_tag_async(self, run_id, tag) -> RunOperations: + """ + Set a tag for the specified run in async fashion. + + :param run_id: String id for the run + :param tag: :py:class:`mlflow.entities.RunTag` instance to set + """ + return self.log_batch_async(run_id, metrics=[], params=[], tags=[tag]) + @abstractmethod def get_metric_history(self, run_id, metric_key, max_results=None, page_token=None): """ @@ -296,13 +325,24 @@ def search_runs( meaningful in such cases. 
""" runs, token = self._search_runs( - experiment_ids, filter_string, run_view_type, max_results, order_by, page_token + experiment_ids, + filter_string, + run_view_type, + max_results, + order_by, + page_token, ) return PagedList(runs, token) @abstractmethod def _search_runs( - self, experiment_ids, filter_string, run_view_type, max_results, order_by, page_token + self, + experiment_ids, + filter_string, + run_view_type, + max_results, + order_by, + page_token, ): """ Return runs that match the given list of search expressions within the experiments, as @@ -332,6 +372,26 @@ def log_batch(self, run_id, metrics, params, tags): """ pass + def log_batch_async(self, run_id, metrics, params, tags) -> RunOperations: + """ + Log multiple metrics, params, and tags for the specified run in async fashion. + This API does not offer immediate consistency of the data. When API returns, + data is accepted but not persisted/processed by back end. Data would be processed + in near real time fashion. + + :param run_id: String id for the run + :param metrics: List of :py:class:`mlflow.entities.Metric` instances to log + :param params: List of :py:class:`mlflow.entities.Param` instances to log + :param tags: List of :py:class:`mlflow.entities.RunTag` instances to log + :return: None. + """ + if not self._async_logging_queue.is_active(): + self._async_logging_queue.activate() + + return self._async_logging_queue.log_batch_async( + run_id=run_id, metrics=metrics, params=params, tags=tags + ) + @abstractmethod def record_logged_model(self, run_id, mlflow_model): """ diff --git a/mlflow/store/tracking/file_store.py b/mlflow/store/tracking/file_store.py index 2b1019aa7dbf1..a04f557879f1d 100644 --- a/mlflow/store/tracking/file_store.py +++ b/mlflow/store/tracking/file_store.py @@ -1,4 +1,3 @@ -import hashlib import json import logging import os @@ -46,7 +45,7 @@ SEARCH_MAX_RESULTS_THRESHOLD, ) from mlflow.store.tracking.abstract_store import AbstractStore -from mlflow.utils import get_results_from_paginated_fn +from mlflow.utils import get_results_from_paginated_fn, insecure_hash from mlflow.utils.file_utils import ( append_to, exists, @@ -1128,13 +1127,13 @@ def log_inputs(self, run_id: str, datasets: Optional[List[DatasetInput]] = None) @staticmethod def _get_dataset_id(dataset_name: str, dataset_digest: str) -> str: - md5 = hashlib.md5(dataset_name.encode("utf-8")) + md5 = insecure_hash.md5(dataset_name.encode("utf-8")) md5.update(dataset_digest.encode("utf-8")) return md5.hexdigest() @staticmethod def _get_input_id(dataset_id: str, run_id: str) -> str: - md5 = hashlib.md5(dataset_id.encode("utf-8")) + md5 = insecure_hash.md5(dataset_id.encode("utf-8")) md5.update(run_id.encode("utf-8")) return md5.hexdigest() diff --git a/mlflow/tracking/_model_registry/client.py b/mlflow/tracking/_model_registry/client.py index 2968e7c1e324e..2db189dadc169 100644 --- a/mlflow/tracking/_model_registry/client.py +++ b/mlflow/tracking/_model_registry/client.py @@ -205,6 +205,19 @@ def create_model_version( self.store._await_model_version_creation(mv, await_creation_for) return mv + def copy_model_version(self, src_mv, dst_name): + """ + Copy a model version from one registered model to another as a new model version. + + :param src_mv: A :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the source model version. + :param dst_name: the name of the registered model to copy the model version to. If a + registered model with this name does not exist, it will be created. 
+ :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the cloned model version. + """ + return self.store.copy_model_version(src_mv=src_mv, dst_name=dst_name) + def update_model_version(self, name, version, description): """ Update metadata associated with a model version in backend. @@ -274,6 +287,10 @@ def search_model_versions( """ Search for model versions in backend that satisfy the filter criteria. + .. warning: + + The model version search results may not have aliases populated for performance reasons. + :param filter_string: A filter string expression. Currently supports a single filter condition either name of model like ``name = 'model_name'`` or ``run_id = '...'``. diff --git a/mlflow/tracking/_model_registry/fluent.py b/mlflow/tracking/_model_registry/fluent.py index d23a2b04d4b81..1c47a96afb3b3 100644 --- a/mlflow/tracking/_model_registry/fluent.py +++ b/mlflow/tracking/_model_registry/fluent.py @@ -234,6 +234,10 @@ def search_model_versions( """ Search for model versions that satisfy the filter criteria. + .. warning: + + The model version search results may not have aliases populated for performance reasons. + :param filter_string: Filter query string (e.g., ``"name = 'a_model_name' and tag.key = 'value1'"``), defaults to searching for all model versions. The following identifiers, comparators, diff --git a/mlflow/tracking/_tracking_service/client.py b/mlflow/tracking/_tracking_service/client.py index eb374d101c4c0..da43ee35a39da 100644 --- a/mlflow/tracking/_tracking_service/client.py +++ b/mlflow/tracking/_tracking_service/client.py @@ -18,6 +18,7 @@ from mlflow.tracking._tracking_service import utils from mlflow.tracking.metric_value_conversion_utils import convert_metric_value_to_float_if_possible from mlflow.utils import chunk_list +from mlflow.utils.async_logging.run_operations import RunOperations, get_combined_run_operations from mlflow.utils.mlflow_tags import MLFLOW_USER from mlflow.utils.string_utils import is_string_type from mlflow.utils.time import get_current_time_millis @@ -261,7 +262,9 @@ def rename_experiment(self, experiment_id, new_name): """ self.store.rename_experiment(experiment_id, new_name) - def log_metric(self, run_id, key, value, timestamp=None, step=None): + def log_metric( + self, run_id, key, value, timestamp=None, step=None, synchronous=True + ) -> Optional[RunOperations]: """ Log a metric against the run ID. @@ -278,21 +281,47 @@ def log_metric(self, run_id, key, value, timestamp=None, step=None): may support larger values. :param timestamp: Time when this metric was calculated. Defaults to the current system time. :param step: Training step (iteration) at which was the metric calculated. Defaults to 0. + :param synchronous: *Experimental* If True, blocks until the metrics is logged + successfully. If False, logs the metrics asynchronously and + returns a future representing the logging operation. + + :return: When synchronous=True, returns None. + When synchronous=False, returns :py:class:`mlflow.RunOperations` that represents + future for logging operation. 
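To show how the copy flow is meant to be driven end to end, here is a hedged sketch against the MlflowClient wrapper introduced later in this diff; the model name, alias, and destination are placeholders.

from mlflow import MlflowClient

client = MlflowClient()
# The source must be addressed with a "models:/" URI (version number or alias);
# the destination registered model is created on the fly if it does not exist.
copied = client.copy_model_version(
    src_model_uri="models:/iris_model@champion",     # placeholder model and alias
    dst_name="iris_model_prod",                      # placeholder destination name
)
print(copied.name, copied.version)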
+ """ timestamp = timestamp if timestamp is not None else get_current_time_millis() step = step if step is not None else 0 metric_value = convert_metric_value_to_float_if_possible(value) metric = Metric(key, metric_value, timestamp, step) - self.store.log_metric(run_id, metric) + if synchronous: + self.store.log_metric(run_id, metric) + else: + return self.store.log_metric_async(run_id, metric) - def log_param(self, run_id, key, value): + def log_param(self, run_id, key, value, synchronous=True): """ Log a parameter (e.g. model hyperparameter) against the run ID. Value is converted to a string. + + :param run_id: ID of the run to log the parameter against. + :param key: Name of the parameter. + :param value: Value of the parameter. + :param synchronous: *Experimental* If True, blocks until the parameters are logged + successfully. If False, logs the parameters asynchronously and + returns a future representing the logging operation. + + :return: When synchronous=True, returns parameter value. + When synchronous=False, returns :py:class:`mlflow.RunOperations` that + represents future for logging operation. """ param = Param(key, str(value)) try: - self.store.log_param(run_id, param) + if synchronous: + self.store.log_param(run_id, param) + return value + else: + return self.store.log_param_async(run_id, param) except MlflowException as e: if e.error_code == ErrorCode.Name(INVALID_PARAMETER_VALUE): msg = f"{e.message}{PARAM_VALIDATION_MSG}" @@ -311,7 +340,7 @@ def set_experiment_tag(self, experiment_id, key, value): tag = ExperimentTag(key, str(value)) self.store.set_experiment_tag(experiment_id, tag) - def set_tag(self, run_id, key, value): + def set_tag(self, run_id, key, value, synchronous=True) -> Optional[RunOperations]: """ Set a tag on the run with the specified ID. Value is converted to a string. @@ -323,9 +352,20 @@ def set_tag(self, run_id, key, value): :param value: Tag value (string, but will be string-ified if not). All backend stores will support values up to length 5000, but some may support larger values. + :param synchronous: *Experimental* If True, blocks until the tag is logged + successfully. If False, logs the tag asynchronously and + returns a future representing the logging operation. + + :return: When synchronous=True, returns None. + When synchronous=False, returns :py:class:`mlflow.RunOperations` object + that represents future for logging operation. + """ tag = RunTag(key, str(value)) - self.store.set_tag(run_id, tag) + if synchronous: + self.store.set_tag(run_id, tag) + else: + return self.store.set_tag_async(run_id, tag) def delete_tag(self, run_id, key): """ @@ -359,7 +399,9 @@ def update_run(self, run_id, status=None, name=None): run_name=name, ) - def log_batch(self, run_id, metrics=(), params=(), tags=()): + def log_batch( + self, run_id, metrics=(), params=(), tags=(), synchronous=True + ) -> Optional[RunOperations]: """ Log multiple metrics, params, and/or tags. @@ -367,9 +409,15 @@ def log_batch(self, run_id, metrics=(), params=(), tags=()): :param metrics: If provided, List of Metric(key, value, timestamp) instances. :param params: If provided, List of Param(key, value) instances. :param tags: If provided, List of RunTag(key, value) instances. + :param synchronous: *Experimental* If True, blocks until the metrics/tags/params are logged + successfully. If False, logs the metrics/tags/params asynchronously + and returns a future representing the logging operation. Raises an MlflowException if any errors occur. 
- :return: None + + :return: When synchronous=True, returns None. + When synchronous=False, returns :py:class:`mlflow.RunOperations` that + represents future for logging operation. """ if len(metrics) == 0 and len(params) == 0 and len(tags) == 0: return @@ -377,6 +425,12 @@ def log_batch(self, run_id, metrics=(), params=(), tags=()): param_batches = chunk_list(params, MAX_PARAMS_TAGS_PER_BATCH) tag_batches = chunk_list(tags, MAX_PARAMS_TAGS_PER_BATCH) + # When given data is split into one or more batches, we need to wait for all the batches. + # Each batch logged returns run_operations which we append to this list + # At the end we merge all the run_operations into a single run_operations object and return. + # Applicable only when synchronous is False + run_operations_list = [] + for params_batch, tags_batch in zip_longest(param_batches, tag_batches, fillvalue=[]): metrics_batch_size = min( MAX_ENTITIES_PER_BATCH - len(params_batch) - len(tags_batch), @@ -386,12 +440,33 @@ def log_batch(self, run_id, metrics=(), params=(), tags=()): metrics_batch = metrics[:metrics_batch_size] metrics = metrics[metrics_batch_size:] - self.store.log_batch( - run_id=run_id, metrics=metrics_batch, params=params_batch, tags=tags_batch - ) + if synchronous: + self.store.log_batch( + run_id=run_id, metrics=metrics_batch, params=params_batch, tags=tags_batch + ) + else: + run_operations_list.append( + self.store.log_batch_async( + run_id=run_id, + metrics=metrics_batch, + params=params_batch, + tags=tags_batch, + ) + ) for metrics_batch in chunk_list(metrics, chunk_size=MAX_METRICS_PER_BATCH): - self.store.log_batch(run_id=run_id, metrics=metrics_batch, params=[], tags=[]) + if synchronous: + self.store.log_batch(run_id=run_id, metrics=metrics_batch, params=[], tags=[]) + else: + run_operations_list.append( + self.store.log_batch_async( + run_id=run_id, metrics=metrics_batch, params=[], tags=[] + ) + ) + + if not synchronous: + # Merge all the run operations into a single run operations object + return get_combined_run_operations(run_operations_list) def log_inputs(self, run_id: str, datasets: Optional[List[DatasetInput]] = None): """ diff --git a/mlflow/tracking/client.py b/mlflow/tracking/client.py index c278bbc916618..f8eb1e5ee6aac 100644 --- a/mlflow/tracking/client.py +++ b/mlflow/tracking/client.py @@ -10,6 +10,7 @@ import posixpath import sys import tempfile +import urllib from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union import yaml @@ -20,6 +21,9 @@ from mlflow.entities.model_registry.model_version_stages import ALL_STAGES from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import FEATURE_DISABLED, RESOURCE_DOES_NOT_EXIST +from mlflow.store.artifact.utils.models import ( + get_model_name_and_version, +) from mlflow.store.entities.paged_list import PagedList from mlflow.store.model_registry import ( SEARCH_MODEL_VERSION_MAX_RESULTS_DEFAULT, @@ -34,6 +38,7 @@ from mlflow.tracking.artifact_utils import _upload_artifacts_to_databricks from mlflow.tracking.registry import UnsupportedModelRegistryStoreURIException from mlflow.utils.annotations import experimental +from mlflow.utils.async_logging.run_operations import RunOperations from mlflow.utils.databricks_utils import get_databricks_run_url from mlflow.utils.logging_utils import eprint from mlflow.utils.mlflow_tags import ( @@ -689,7 +694,8 @@ def log_metric( value: float, timestamp: Optional[int] = None, step: Optional[int] = None, - ) -> None: + synchronous: bool = True, + ) -> 
Optional[RunOperations]: """ Log a metric against the run ID. @@ -706,6 +712,13 @@ def log_metric( :param timestamp: Time when this metric was calculated. Defaults to the current system time. :param step: Integer training step (iteration) at which was the metric calculated. Defaults to 0. + :param synchronous: *Experimental* If True, blocks until the metric is logged successfully. + If False, logs the metric asynchronously and returns a future + representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. code-block:: python :caption: Example @@ -736,6 +749,9 @@ def print_run_info(r): run = client.get_run(run.info.run_id) print_run_info(run) + # To log metric in async fashion + client.log_metric(run.info.run_id, "m", 1.5, synchronous=False) + .. code-block:: text :caption: Output @@ -747,9 +763,13 @@ def print_run_info(r): metrics: {'m': 1.5} status: FINISHED """ - self._tracking_client.log_metric(run_id, key, value, timestamp, step) + return self._tracking_client.log_metric( + run_id, key, value, timestamp, step, synchronous=synchronous + ) - def log_param(self, run_id: str, key: str, value: Any) -> Any: + def log_param( + self, run_id: str, key: str, value: Any, synchronous: Optional[bool] = True + ) -> Any: """ Log a parameter (e.g. model hyperparameter) against the run ID. @@ -761,7 +781,13 @@ def log_param(self, run_id: str, key: str, value: Any) -> Any: :param value: Parameter value (string, but will be string-ified if not). All built-in backend stores support values up to length 6000, but some may support larger values. - :return: the parameter value that is logged. + :param synchronous: *Experimental* If True, blocks until the parameter is logged + successfully. If False, logs the parameter asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns parameter value. When `synchronous=False`, + returns an :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` + instance that represents future for logging operation. .. code-block:: python :caption: Example @@ -804,8 +830,11 @@ def print_run_info(r): params: {'p': '1'} status: FINISHED """ - self._tracking_client.log_param(run_id, key, value) - return value + if synchronous: + self._tracking_client.log_param(run_id, key, value, synchronous=True) + return value + else: + return self._tracking_client.log_param(run_id, key, value, synchronous=False) def set_experiment_tag(self, experiment_id: str, key: str, value: Any) -> None: """ @@ -838,7 +867,9 @@ def set_experiment_tag(self, experiment_id: str, key: str, value: Any) -> None: """ self._tracking_client.set_experiment_tag(experiment_id, key, value) - def set_tag(self, run_id: str, key: str, value: Any) -> None: + def set_tag( + self, run_id: str, key: str, value: Any, synchronous: bool = True + ) -> Optional[RunOperations]: """ Set a tag on the run with the specified ID. Value is converted to a string. @@ -850,6 +881,13 @@ def set_tag(self, run_id: str, key: str, value: Any) -> None: :param value: Tag value (string, but will be string-ified if not). All backend stores will support values up to length 5000, but some may support larger values. + :param synchronous: *Experimental* If True, blocks until the tag is logged successfully. 
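The same flag is available on log_batch, which is useful when many entities are produced at once. A hedged sketch, again with placeholder values:

import time
from mlflow import MlflowClient
from mlflow.entities import Metric, Param, RunTag

client = MlflowClient()
run = client.create_run(experiment_id="0")           # illustration only
ts = int(time.time() * 1000)
ops = client.log_batch(
    run.info.run_id,
    metrics=[Metric("acc", 0.91, ts, 0), Metric("loss", 0.22, ts, 0)],
    params=[Param("lr", "0.01")],
    tags=[RunTag("stage", "eval")],
    synchronous=False,        # one RunOperations covering every chunked sub-batch
)
ops.wait()
client.set_terminated(run.info.run_id)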
+ If False, logs the tag asynchronously and returns a future + representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. code-block:: python :caption: Example @@ -883,7 +921,7 @@ def print_run_info(run): run_id: 4f226eb5758145e9b28f78514b59a03b Tags: {'nlp.framework': 'Spark NLP'} """ - self._tracking_client.set_tag(run_id, key, value) + return self._tracking_client.set_tag(run_id, key, value, synchronous=synchronous) def delete_tag(self, run_id: str, key: str) -> None: """ @@ -982,7 +1020,8 @@ def log_batch( metrics: Sequence[Metric] = (), params: Sequence[Param] = (), tags: Sequence[RunTag] = (), - ) -> None: + synchronous: bool = True, + ) -> Optional[RunOperations]: """ Log multiple metrics, params, and/or tags. @@ -990,9 +1029,15 @@ def log_batch( :param metrics: If provided, List of Metric(key, value, timestamp) instances. :param params: If provided, List of Param(key, value) instances. :param tags: If provided, List of RunTag(key, value) instances. + :param synchronous: *Experimental* If True, blocks until the metrics/tags/params are logged + successfully. If False, logs the metrics/tags/params asynchronously + and returns a future representing the logging operation. Raises an MlflowException if any errors occur. - :return: None + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. code-block:: python :caption: Example @@ -1026,6 +1071,9 @@ def print_run_info(r): run = client.get_run(run.info.run_id) print_run_info(run) + # To log metric in async fashion + client.log_metric(run.info.run_id, "m", 1.5, synchronous=False) + .. code-block:: text :caption: Output @@ -1035,7 +1083,9 @@ def print_run_info(r): tags: {'t': 't'} status: FINISHED """ - self._tracking_client.log_batch(run_id, metrics, params, tags) + return self._tracking_client.log_batch( + run_id, metrics, params, tags, synchronous=synchronous + ) @experimental def log_inputs( @@ -1669,7 +1719,9 @@ def get_artifact_data(run): if artifact_file in artifacts: with tempfile.TemporaryDirectory() as tmpdir: downloaded_artifact_path = mlflow.artifacts.download_artifacts( - run_id=run_id, artifact_path=artifact_file, dst_path=tmpdir + run_id=run_id, + artifact_path=artifact_file, + dst_path=tmpdir, ) existing_predictions = pd.read_json(downloaded_artifact_path, orient="split") if extra_columns is not None: @@ -2667,6 +2719,29 @@ def create_model_version( await_creation_for=await_creation_for, ) + def copy_model_version(self, src_model_uri, dst_name) -> ModelVersion: + """ + Copy a model version from one registered model to another as a new model version. + + :param src_model_uri: the model URI of the model version to copy. This must be a model + registry URI with a `"models:/"` scheme (e.g., + `"models:/iris_model@champion"`). + :param dst_name: the name of the registered model to copy the model version to. If a + registered model with this name does not exist, it will be created. + :return: Single :py:class:`mlflow.entities.model_registry.ModelVersion` object representing + the copied model version. + """ + if urllib.parse.urlparse(src_model_uri).scheme != "models": + raise MlflowException( + f"Unsupported source model URI: '{src_model_uri}'. 
The `copy_model_version` API " + "only copies models stored in the 'models:/' scheme." + ) + client = self._get_registry_client() + src_name, src_version = get_model_name_and_version(client, src_model_uri) + src_mv = client.get_model_version(src_name, src_version) + + return client.copy_model_version(src_mv=src_mv, dst_name=dst_name) + def update_model_version( self, name: str, version: str, description: Optional[str] = None ) -> ModelVersion: @@ -3018,6 +3093,10 @@ def search_model_versions( """ Search for model versions in backend that satisfy the filter criteria. + .. warning: + + The model version search results may not have aliases populated for performance reasons. + :param filter_string: Filter query string (e.g., ``"name = 'a_model_name' and tag.key = 'value1'"``), defaults to searching for all model versions. The following identifiers, comparators, @@ -3087,8 +3166,8 @@ def search_model_versions( ) def get_model_version_stages( - self, name: str, version: str # pylint: disable=unused-argument - ) -> List[str]: + self, name: str, version: str + ) -> List[str]: # pylint: disable=unused-argument """ :return: A list of valid stages. diff --git a/mlflow/tracking/fluent.py b/mlflow/tracking/fluent.py index cafccf6e6bc94..af21aa0163a13 100644 --- a/mlflow/tracking/fluent.py +++ b/mlflow/tracking/fluent.py @@ -42,6 +42,7 @@ from mlflow.tracking.default_experiment import registry as default_experiment_registry from mlflow.utils import get_results_from_paginated_fn from mlflow.utils.annotations import experimental +from mlflow.utils.async_logging.run_operations import RunOperations from mlflow.utils.autologging_utils import ( AUTOLOGGING_CONF_KEY_IS_GLOBALLY_CONFIGURED, AUTOLOGGING_INTEGRATIONS, @@ -70,6 +71,7 @@ import PIL import plotly + _active_run_stack = [] run_id_to_system_metrics_monitor = {} _active_experiment_id = None @@ -596,7 +598,7 @@ def get_parent_run(run_id: str) -> Optional[Run]: return MlflowClient().get_parent_run(run_id) -def log_param(key: str, value: Any) -> Any: +def log_param(key: str, value: Any, synchronous: bool = True) -> Any: """ Log a parameter (e.g. model hyperparameter) under the current run. If no run is active, this method will create a new active run. @@ -608,8 +610,13 @@ def log_param(key: str, value: Any) -> Any: :param value: Parameter value (string, but will be string-ified if not). All built-in backend stores support values up to length 6000, but some may support larger values. + :param synchronous: *Experimental* If True, blocks until the parameter is logged + successfully. If False, logs the parameter asynchronously and + returns a future representing the logging operation. - :return: the parameter value that is logged. + :return: When `synchronous=True`, returns parameter value. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. 
testcode:: python :caption: Example @@ -619,9 +626,10 @@ def log_param(key: str, value: Any) -> Any: with mlflow.start_run(): value = mlflow.log_param("learning_rate", 0.01) assert value == 0.01 + value = mlflow.log_param("learning_rate", 0.02, synchronous=False) """ run_id = _get_or_start_run().info.run_id - return MlflowClient().log_param(run_id, key, value) + return MlflowClient().log_param(run_id, key, value, synchronous=synchronous) def set_experiment_tag(key: str, value: Any) -> None: @@ -648,7 +656,7 @@ def set_experiment_tag(key: str, value: Any) -> None: MlflowClient().set_experiment_tag(experiment_id, key, value) -def set_tag(key: str, value: Any) -> None: +def set_tag(key: str, value: Any, synchronous: bool = True) -> Optional[RunOperations]: """ Set a tag under the current run. If no run is active, this method will create a new active run. @@ -660,17 +668,29 @@ def set_tag(key: str, value: Any) -> None: :param value: Tag value (string, but will be string-ified if not). All backend stores will support values up to length 5000, but some may support larger values. + :param synchronous: *Experimental* If True, blocks until the tag is logged + successfully. If False, logs the tag asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. testcode:: python :caption: Example import mlflow + # Set a tag. with mlflow.start_run(): mlflow.set_tag("release.version", "2.2.0") + + # Set a tag in async fashion. + with mlflow.start_run(): + mlflow.set_tag("release.version", "2.2.1", synchronous=False) """ run_id = _get_or_start_run().info.run_id - MlflowClient().set_tag(run_id, key, value) + return MlflowClient().set_tag(run_id, key, value, synchronous=synchronous) def delete_tag(key: str) -> None: @@ -697,7 +717,9 @@ def delete_tag(key: str) -> None: MlflowClient().delete_tag(run_id, key) -def log_metric(key: str, value: float, step: Optional[int] = None) -> None: +def log_metric( + key: str, value: float, step: Optional[int] = None, synchronous: bool = True +) -> Optional[RunOperations]: """ Log a metric under the current run. If no run is active, this method will create a new active run. @@ -712,20 +734,36 @@ def log_metric(key: str, value: float, step: Optional[int] = None) -> None: All backend stores will support values up to length 5000, but some may support larger values. :param step: Metric step (int). Defaults to zero if unspecified. + :param synchronous: *Experimental* If True, blocks until the parameter is logged + successfully. If False, logs the parameter asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns None. + When `synchronous=False`, returns `RunOperations` that represents future for + logging operation. .. testcode:: python :caption: Example import mlflow + # Log a metric with mlflow.start_run(): mlflow.log_metric("mse", 2500.00) + + # Log a metric in async fashion. 
+ with mlflow.start_run(): + mlflow.log_metric("mse", 2500.00, synchronous=False) """ run_id = _get_or_start_run().info.run_id - MlflowClient().log_metric(run_id, key, value, get_current_time_millis(), step or 0) + return MlflowClient().log_metric( + run_id, key, value, get_current_time_millis(), step or 0, synchronous=synchronous + ) -def log_metrics(metrics: Dict[str, float], step: Optional[int] = None) -> None: +def log_metrics( + metrics: Dict[str, float], step: Optional[int] = None, synchronous: bool = True +) -> Optional[RunOperations]: """ Log multiple metrics for the current run. If no run is active, this method will create a new active run. @@ -737,7 +775,13 @@ def log_metrics(metrics: Dict[str, float], step: Optional[int] = None) -> None: :param step: A single integer step at which to log the specified Metrics. If unspecified, each metric is logged at step zero. - :returns: None + :param synchronous: *Experimental* If True, blocks until the metrics is logged + successfully. If False, logs the metrics asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. testcode:: python :caption: Example @@ -749,21 +793,33 @@ def log_metrics(metrics: Dict[str, float], step: Optional[int] = None) -> None: # Log a batch of metrics with mlflow.start_run(): mlflow.log_metrics(metrics) + + # Log a batch of metrics in async fashion. + with mlflow.start_run(): + mlflow.log_metrics(metrics, synchronous=False) """ run_id = _get_or_start_run().info.run_id timestamp = get_current_time_millis() metrics_arr = [Metric(key, value, timestamp, step or 0) for key, value in metrics.items()] - MlflowClient().log_batch(run_id=run_id, metrics=metrics_arr, params=[], tags=[]) + return MlflowClient().log_batch( + run_id=run_id, metrics=metrics_arr, params=[], tags=[], synchronous=synchronous + ) -def log_params(params: Dict[str, Any]) -> None: +def log_params(params: Dict[str, Any], synchronous: bool = True) -> Optional[RunOperations]: """ Log a batch of params for the current run. If no run is active, this method will create a new active run. :param params: Dictionary of param_name: String -> value: (String, but will be string-ified if not) - :returns: None + :param synchronous: *Experimental* If True, blocks until the parameters are logged + successfully. If False, logs the parameters asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. testcode:: python :caption: Example @@ -775,10 +831,16 @@ def log_params(params: Dict[str, Any]) -> None: # Log a batch of parameters with mlflow.start_run(): mlflow.log_params(params) + + # Log a batch of parameters in async fashion. 
+ with mlflow.start_run(): + mlflow.log_params(params, synchronous=False) """ run_id = _get_or_start_run().info.run_id params_arr = [Param(key, str(value)) for key, value in params.items()] - MlflowClient().log_batch(run_id=run_id, metrics=[], params=params_arr, tags=[]) + return MlflowClient().log_batch( + run_id=run_id, metrics=[], params=params_arr, tags=[], synchronous=synchronous + ) @experimental @@ -844,14 +906,20 @@ def set_experiment_tags(tags: Dict[str, Any]) -> None: set_experiment_tag(key, value) -def set_tags(tags: Dict[str, Any]) -> None: +def set_tags(tags: Dict[str, Any], synchronous: bool = True) -> Optional[RunOperations]: """ Log a batch of tags for the current run. If no run is active, this method will create a new active run. :param tags: Dictionary of tag_name: String -> value: (String, but will be string-ified if not) - :returns: None + :param synchronous: *Experimental* If True, blocks until the tag is logged + successfully. If False, logs the tag asynchronously and + returns a future representing the logging operation. + + :return: When `synchronous=True`, returns None. When `synchronous=False`, returns an + :py:class:`mlflow.utils.async_logging.run_operations.RunOperations` instance that + represents future for logging operation. .. testcode:: python :caption: Example @@ -867,10 +935,16 @@ def set_tags(tags: Dict[str, Any]) -> None: # Set a batch of tags with mlflow.start_run(): mlflow.set_tags(tags) + + # Set a batch of tags in async fashion. + with mlflow.start_run(): + mlflow.set_tags(tags, synchronous=False) """ run_id = _get_or_start_run().info.run_id tags_arr = [RunTag(key, str(value)) for key, value in tags.items()] - MlflowClient().log_batch(run_id=run_id, metrics=[], params=[], tags=tags_arr) + return MlflowClient().log_batch( + run_id=run_id, metrics=[], params=[], tags=tags_arr, synchronous=synchronous + ) def log_artifact(local_path: str, artifact_path: Optional[str] = None) -> None: diff --git a/mlflow/transformers/__init__.py b/mlflow/transformers/__init__.py index 3433a65ad5d9c..31f567cc75317 100644 --- a/mlflow/transformers/__init__.py +++ b/mlflow/transformers/__init__.py @@ -426,6 +426,20 @@ def save_model( else: built_pipeline = transformers_model + # Verify that the model has not been loaded to distributed memory + # NB: transformers does not correctly save a model whose weights have been loaded + # using accelerate iff the model weights have been loaded using a device_map that is + # heterogeneous. There is a distinct possibility for a partial write to occur, causing an + # invalid state of the model's weights in this scenario. Hence, we raise. + if _is_model_distributed_in_memory(built_pipeline.model): + raise MlflowException( + "The model that is attempting to be saved has been loaded into memory " + "with an incompatible configuration. If you are using the accelerate " + "library to load your model, please ensure that it is saved only after " + "loading with the default device mapping. Do not specify `device_map` " + "and please try again." + ) + if mlflow_model is None: mlflow_model = Model() if signature is not None: @@ -852,6 +866,18 @@ def load_model( return _load_model(local_model_path, flavor_config, return_type, device, **kwargs) +def _is_model_distributed_in_memory(transformers_model): + """Check if the model is distributed across multiple devices in memory.""" + + # Check if the model attribute exists. 
If not, accelerate was not used and the model can + # be safely saved + if not hasattr(transformers_model, "hf_device_map"): + return False + # If the device map has more than one unique value entry, then the weights are not within + # a contiguous memory system (VRAM, SYS, or DISK) and thus cannot be safely saved. + return len(set(transformers_model.hf_device_map.values())) > 1 + + # This function attempts to determine if a GPU is available for the PyTorch and TensorFlow libraries def is_gpu_available(): # try pytorch and if it fails, try tf @@ -2316,8 +2342,16 @@ def _parse_list_of_multiple_dicts(output_data, target_dict_key): Returns the first value of the `target_dict_key` that matches in the first dictionary in a list of dictionaries. """ + + def fetch_target_key_value(data, key): + if isinstance(data[0], dict): + return data[0][key] + return [item[0][key] for item in data] + if isinstance(output_data[0], list): - return [collection[0][target_dict_key] for collection in output_data] + return [ + fetch_target_key_value(collection, target_dict_key) for collection in output_data + ] else: return [output_data[0][target_dict_key]] diff --git a/mlflow/utils/_capture_modules.py b/mlflow/utils/_capture_modules.py index f0e80e0c223b7..c649e50e17add 100644 --- a/mlflow/utils/_capture_modules.py +++ b/mlflow/utils/_capture_modules.py @@ -14,7 +14,10 @@ from mlflow.pyfunc import MAIN from mlflow.utils._spark_utils import _prepare_subprocess_environ_for_creating_local_spark_session from mlflow.utils.file_utils import write_to -from mlflow.utils.requirements_utils import DATABRICKS_MODULES_TO_PACKAGES +from mlflow.utils.requirements_utils import ( + DATABRICKS_MODULES_TO_PACKAGES, + MLFLOW_MODULES_TO_PACKAGES, +) def _get_top_level_module(full_module_name): @@ -79,6 +82,12 @@ def _record_imported_module(self, full_module_name): self.imported_modules.add(databricks_module) return + # special casing for mlflow extras since they may not be required by default + if top_level_module == "mlflow": + if second_level_module in MLFLOW_MODULES_TO_PACKAGES: + self.imported_modules.add(second_level_module) + return + self.imported_modules.add(top_level_module) def __enter__(self): diff --git a/mlflow/utils/_spark_utils.py b/mlflow/utils/_spark_utils.py index 239ce113f3763..80a2b417deedc 100644 --- a/mlflow/utils/_spark_utils.py +++ b/mlflow/utils/_spark_utils.py @@ -46,7 +46,7 @@ def _create_local_spark_session_for_recipes(): _prepare_subprocess_environ_for_creating_local_spark_session() return ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/mlflow/utils/async_logging/__init__.py b/mlflow/utils/async_logging/__init__.py new file mode 100644 index 0000000000000..245b74b4eebc6 --- /dev/null +++ b/mlflow/utils/async_logging/__init__.py @@ -0,0 +1 @@ +from mlflow.utils.async_logging import run_operations # noqa: F401 diff --git a/mlflow/utils/async_logging/async_logging_queue.py b/mlflow/utils/async_logging/async_logging_queue.py new file mode 100644 index 0000000000000..77579b75db853 --- /dev/null +++ b/mlflow/utils/async_logging/async_logging_queue.py @@ -0,0 +1,230 @@ +""" +Defines an AsyncLoggingQueue that provides async fashion logging of metrics/tags/params using +queue based approach. 
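The device-map rule applied by _is_model_distributed_in_memory is easiest to see with made-up maps: saving is refused only when the weights span more than one memory system.

homogeneous = {"embed": 0, "encoder": 0, "lm_head": 0}              # all on GPU 0 -> saveable
heterogeneous = {"embed": 0, "encoder": "cpu", "lm_head": "disk"}   # spread out -> rejected

def is_distributed(device_map: dict) -> bool:
    # Same test as above: more than one distinct placement means a partial write is possible.
    return len(set(device_map.values())) > 1

assert not is_distributed(homogeneous)
assert is_distributed(heterogeneous)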
+""" + +import atexit +import logging +import threading +from concurrent.futures import ThreadPoolExecutor +from queue import Empty, Queue + +from mlflow.entities.metric import Metric +from mlflow.entities.param import Param +from mlflow.entities.run_tag import RunTag +from mlflow.utils.async_logging.run_batch import RunBatch +from mlflow.utils.async_logging.run_operations import RunOperations + +_logger = logging.getLogger(__name__) + + +class AsyncLoggingQueue: + """ + This is a queue based run data processor that queues incoming batches and processes them using + single worker thread. + """ + + def __init__(self, logging_func: callable([str, [Metric], [Param], [RunTag]])) -> None: + """Initializes an AsyncLoggingQueue object. + + Args: + logging_func: A callable function that takes in four arguments: a string + representing the run_id, a list of Metric objects, + a list of Param objects, and a list of RunTag objects. + """ + self._queue = Queue() + self._lock = threading.RLock() + self._logging_func = logging_func + + self._is_activated = False + + def _at_exit_callback(self) -> None: + """Callback function to be executed when the program is exiting. + + Stops the data processing thread and waits for the queue to be drained. Finally, shuts down + the thread pools used for data logging and batch processing status check. + """ + try: + # Stop the data processing thread + self._stop_data_logging_thread_event.set() + # Waits till queue is drained. + self._run_data_logging_thread.result() + self._batch_logging_threadpool.shutdown(wait=False) + self._batch_status_check_threadpool.shutdown(wait=False) + except Exception as e: + _logger.error(f"Encountered error while trying to finish logging: {e}") + + def _logging_loop(self) -> None: + """ + Continuously logs run data until `self._continue_to_process_data` is set to False. + If an exception occurs during logging, a `MlflowException` is raised. + """ + try: + while not self._stop_data_logging_thread_event.is_set(): + self._log_run_data() + except Exception as e: + from mlflow.exceptions import MlflowException + + raise MlflowException(f"Exception inside the run data logging thread: {e}") + + def _log_run_data(self) -> None: + """Process the run data in the running runs queues. + + For each run in the running runs queues, this method retrieves the next batch of run data + from the queue and processes it by calling the `_processing_func` method with the run ID, + metrics, parameters, and tags in the batch. If the batch is empty, it is skipped. After + processing the batch, the processed watermark is updated and the batch event is set. + If an exception occurs during processing, the exception is logged and the batch event is set + with the exception. If the queue is empty, it is ignored. + + Returns: None + """ + run_batch = None # type: RunBatch + try: + run_batch = self._queue.get(timeout=1) + except Empty: + # Ignore empty queue exception + return + try: + self._logging_func( + run_id=run_batch.run_id, + metrics=run_batch.metrics, + params=run_batch.params, + tags=run_batch.tags, + ) + + # Signal the batch processing is done. + run_batch.completion_event.set() + + except Exception as e: + _logger.error(f"Run Id {run_batch.run_id}: Failed to log run data: Exception: {e}") + run_batch.exception = e + run_batch.completion_event.set() + + def _wait_for_batch(self, batch: RunBatch) -> None: + """Wait for the given batch to be processed by the logging thread. + + Args: + batch: The batch to wait for. 
+ + Raises: + Exception: If an exception occurred while processing the batch. + """ + batch.completion_event.wait() + if batch.exception: + raise batch.exception + + def __getstate__(self): + """Return the state of the object for pickling. + + This method is called by the `pickle` module when the object is being pickled. It returns a + dictionary containing the object's state, with non-picklable attributes removed. + + Returns: + dict: A dictionary containing the object's state. + """ + state = self.__dict__.copy() + del state["_queue"] + del state["_lock"] + del state["_is_activated"] + + if "_run_data_logging_thread" in state: + del state["_run_data_logging_thread"] + if "_stop_data_logging_thread_event" in state: + del state["_stop_data_logging_thread_event"] + if "_batch_logging_threadpool" in state: + del state["_batch_logging_threadpool"] + if "_batch_status_check_threadpool" in state: + del state["_batch_status_check_threadpool"] + if "_run_data_logging_thread" in state: + del state["_run_data_logging_thread"] + if "_stop_data_logging_thread_event" in state: + del state["_stop_data_logging_thread_event"] + + return state + + def __setstate__(self, state): + """Set the state of the object from a given state dictionary. + + It pops back the removed non-picklable attributes from `self.__getstate__()`. + + Args: + state (dict): A dictionary containing the state of the object. + + Returns: + None + """ + self.__dict__.update(state) + self._queue = Queue() + self._lock = threading.RLock() + self._is_activated = False + self._batch_logging_threadpool = None + self._batch_status_check_threadpool = None + self._stop_data_logging_thread_event = None + + def log_batch_async( + self, run_id: str, params: [Param], tags: [RunTag], metrics: [Metric] + ) -> RunOperations: + """Asynchronously logs a batch of run data (parameters, tags, and metrics). + + Args: + run_id (str): The ID of the run to log data for. + params (list[mlflow.entities.Param]): A list of parameters to log for the run. + tags (list[mlflow.entities.RunTag]): A list of tags to log for the run. + metrics (list[mlflow.entities.Metric]): A list of metrics to log for the run. + + Returns: + mlflow.utils.async_utils.RunOperations: An object that encapsulates the + asynchronous operation of logging the batch of run data. + The object contains a list of `concurrent.futures.Future` objects that can be used + to check the status of the operation and retrieve any exceptions + that occurred during the operation. + """ + from mlflow import MlflowException + + if not self._is_activated: + raise MlflowException("AsyncLoggingQueue is not activated.") + batch = RunBatch( + run_id=run_id, + params=params, + tags=tags, + metrics=metrics, + completion_event=threading.Event(), + ) + + self._queue.put(batch) + + operation_future = self._batch_status_check_threadpool.submit(self._wait_for_batch, batch) + return RunOperations(operation_futures=[operation_future]) + + def is_active(self) -> bool: + return self._is_activated + + def activate(self) -> None: + """Activates the async logging queue + + 1. Initializes queue draining thread. + 2. Initializes threads for checking the status of logged batch. + 3. Registering an atexit callback to ensure that any remaining log data + is flushed before the program exits. + + If the queue is already activated, this method does nothing. 
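The pickling strategy above boils down to: drop members that cannot cross process boundaries, then rebuild them, deactivated, on load. A minimal self-contained version of that pattern:

import pickle
import threading
from queue import Queue

class QueueBacked:
    def __init__(self):
        self._queue = Queue()
        self._lock = threading.RLock()
        self._is_activated = False

    def __getstate__(self):
        state = self.__dict__.copy()
        # Queues, locks, threads and pools are not picklable; recreate them on load.
        for key in ("_queue", "_lock", "_is_activated"):
            state.pop(key, None)
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self._queue = Queue()
        self._lock = threading.RLock()
        self._is_activated = False    # the copy must be activated again before use

clone = pickle.loads(pickle.dumps(QueueBacked()))
assert not clone._is_activated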
+ """ + with self._lock: + if self._is_activated: + return + + self._stop_data_logging_thread_event = threading.Event() + + # Keeping max_workers=1 so that there are no two threads + self._batch_logging_threadpool = ThreadPoolExecutor(max_workers=1) + + self._batch_status_check_threadpool = ThreadPoolExecutor(max_workers=10) + + self._run_data_logging_thread = self._batch_logging_threadpool.submit( + self._logging_loop + ) # concurrent.futures.Future[self._logging_loop] + + atexit.register(self._at_exit_callback) + + self._is_activated = True diff --git a/mlflow/utils/async_logging/run_batch.py b/mlflow/utils/async_logging/run_batch.py new file mode 100644 index 0000000000000..de89ecf1104ae --- /dev/null +++ b/mlflow/utils/async_logging/run_batch.py @@ -0,0 +1,33 @@ +import threading +from typing import List + +from mlflow.entities.metric import Metric +from mlflow.entities.param import Param +from mlflow.entities.run_tag import RunTag + + +class RunBatch: + def __init__( + self, + run_id: str, + params: List[Param], + tags: List[RunTag], + metrics: List[Metric], + completion_event: threading.Event, + ) -> None: + """ + Initializes an instance of RunBatch. + + Args: + run_id: The ID of the run. + params: A list of parameters. + tags: A list of tags. + metrics: A list of metrics. + completion_event: A threading.Event object. + """ + self.run_id = run_id + self.params = params or [] + self.tags = tags or [] + self.metrics = metrics or [] + self.completion_event = completion_event + self.exception = None diff --git a/mlflow/utils/async_logging/run_operations.py b/mlflow/utils/async_logging/run_operations.py new file mode 100644 index 0000000000000..90b92c3525821 --- /dev/null +++ b/mlflow/utils/async_logging/run_operations.py @@ -0,0 +1,54 @@ +class RunOperations: + """ + Represents a collection of operations on one or more MLflow Runs, such as run creation + or metric logging. + """ + + def __init__(self, operation_futures): + self._operation_futures = operation_futures or [] + + def wait(self): + """ + Blocks on completion of the MLflow Run operations. + """ + from mlflow.exceptions import MlflowException + + failed_operations = [] + for future in self._operation_futures: + try: + future.result() + except Exception as e: + failed_operations.append(e) + + if len(failed_operations) > 0: + # Importing MlflowException gives circular reference / module load error, need to + # figure out why. + raise MlflowException( + "The following failures occurred while performing one or more logging" + + f" operations: {failed_operations}" + ) + + +def get_combined_run_operations(run_operations_list: [RunOperations]) -> RunOperations: + """ + Given a list of RunOperations, returns a single RunOperations object that represents the + combined set of operations. If the input list is empty, returns None. If the input list + contains only one element, returns that element. Otherwise, creates a new RunOperations + object that combines the operation futures from each input RunOperations object. + + :param run_operations_list: A list of RunOperations objects to combine. + :type run_operations_list: list[RunOperations] + :return: A single RunOperations object that represents the combined set of operations. 
+ :rtype: RunOperations + """ + if not run_operations_list: + return None + if len(run_operations_list) == 1: + return run_operations_list[0] + + if len(run_operations_list) > 1: + operation_futures = [] + for run_operations in run_operations_list: + if run_operations and run_operations._operation_futures: + operation_futures.extend(run_operations._operation_futures) + return RunOperations(operation_futures) diff --git a/mlflow/utils/conda.py b/mlflow/utils/conda.py index 522605acd1736..b22681a2ad56e 100644 --- a/mlflow/utils/conda.py +++ b/mlflow/utils/conda.py @@ -1,4 +1,3 @@ -import hashlib import json import logging import os @@ -7,7 +6,7 @@ from mlflow.environment_variables import MLFLOW_CONDA_CREATE_ENV_CMD, MLFLOW_CONDA_HOME from mlflow.exceptions import ExecutionException -from mlflow.utils import process +from mlflow.utils import insecure_hash, process from mlflow.utils.environment import Environment _logger = logging.getLogger(__name__) @@ -61,12 +60,12 @@ def _get_conda_env_name(conda_env_path, env_id=None, env_root_dir=None): if env_id: conda_env_contents += env_id - env_name = "mlflow-%s" % hashlib.sha1(conda_env_contents.encode("utf-8")).hexdigest() + env_name = "mlflow-%s" % insecure_hash.sha1(conda_env_contents.encode("utf-8")).hexdigest() if env_root_dir: env_root_dir = os.path.normpath(env_root_dir) # Generate env name with format "mlflow-{conda_env_contents_hash}-{env_root_dir_hash}" # hashing `conda_env_contents` and `env_root_dir` separately helps debugging - env_name += "-%s" % hashlib.sha1(env_root_dir.encode("utf-8")).hexdigest() + env_name += "-%s" % insecure_hash.sha1(env_root_dir.encode("utf-8")).hexdigest() return env_name diff --git a/mlflow/utils/credentials.py b/mlflow/utils/credentials.py index 804f2740172a0..ff34c5db7e623 100644 --- a/mlflow/utils/credentials.py +++ b/mlflow/utils/credentials.py @@ -89,13 +89,14 @@ def _check_databricks_auth(): try: w = WorkspaceClient() # If credentials are invalid, `clusters.list()` will throw an error. - w.current_user.me() + w.clusters.list() _logger.info( "Succesfully signed in Databricks! Please run `mlflow.set_tracking_uri('databricks')` " "to connect MLflow to Databricks tracking server." ) return True - except Exception: + except Exception as e: + _logger.error(f"Failed to sign in Databricks: {e}") return False diff --git a/mlflow/utils/docstring_utils.py b/mlflow/utils/docstring_utils.py index bba712b1e5202..c2c41bbecf5ac 100644 --- a/mlflow/utils/docstring_utils.py +++ b/mlflow/utils/docstring_utils.py @@ -191,7 +191,7 @@ class that describes the model's inputs and outputs. If not specified but an based on the supplied input example and model. To disable automatic signature inference when providing an input example, set ``signature`` to ``False``. To manually infer a model signature, call -:py:func:`infer_signature() ` on datasets +:py:func:`infer_signature() ` on datasets with valid model inputs, such as a training dataset with the target column omitted, and valid model outputs, like model predictions made on the training dataset, for example: @@ -209,10 +209,10 @@ class that describes the model's inputs and outputs. If not specified but an as a hint of what data to feed the model. It will be converted to a Pandas DataFrame and then serialized to json using the Pandas split-oriented format, or a numpy array where the example will be serialized to json -by converting it to a list. 
If input example is a tuple, then the first element -must be a valid model input, and the second element must be a valid params -dictionary that could be used for model inference. Bytes are base64-encoded. -When the ``signature`` parameter is ``None``, the input example is used to +by converting it to a list. If input example is a tuple, then the first element +must be a valid model input, and the second element must be a valid params +dictionary that could be used for model inference. Bytes are base64-encoded. +When the ``signature`` parameter is ``None``, the input example is used to infer a model signature. """, } diff --git a/mlflow/utils/download_cloud_file_chunk.py b/mlflow/utils/download_cloud_file_chunk.py index 79ba106e6faeb..b2fd5d572e77e 100644 --- a/mlflow/utils/download_cloud_file_chunk.py +++ b/mlflow/utils/download_cloud_file_chunk.py @@ -7,8 +7,6 @@ import os import sys -from requests.exceptions import ChunkedEncodingError, ConnectionError, HTTPError - def parse_args(): parser = argparse.ArgumentParser() @@ -17,7 +15,6 @@ def parse_args(): parser.add_argument("--headers", required=True, type=str) parser.add_argument("--download-path", required=True, type=str) parser.add_argument("--http-uri", required=True, type=str) - parser.add_argument("--temp-file", required=True, type=str) return parser.parse_args() @@ -32,29 +29,13 @@ def main(): download_chunk = module.download_chunk args = parse_args() - - try: - download_chunk( - range_start=args.range_start, - range_end=args.range_end, - headers=json.loads(args.headers), - download_path=args.download_path, - http_uri=args.http_uri, - ) - except (ConnectionError, ChunkedEncodingError): - with open(args.temp_file, "w") as f: - json.dump({"retryable": True}, f) - raise - except HTTPError as e: - with open(args.temp_file, "w") as f: - json.dump( - { - "retryable": e.response.status_code in (401, 403, 408), - "status_code": e.response.status_code, - }, - f, - ) - raise + download_chunk( + range_start=args.range_start, + range_end=args.range_end, + headers=json.loads(args.headers), + download_path=args.download_path, + http_uri=args.http_uri, + ) if __name__ == "__main__": diff --git a/mlflow/utils/environment.py b/mlflow/utils/environment.py index d530add4ae2a6..0eb052cdf7d79 100644 --- a/mlflow/utils/environment.py +++ b/mlflow/utils/environment.py @@ -1,4 +1,3 @@ -import hashlib import logging import os import re @@ -10,7 +9,7 @@ from mlflow.exceptions import MlflowException from mlflow.protos.databricks_pb2 import INVALID_PARAMETER_VALUE -from mlflow.utils import PYTHON_VERSION +from mlflow.utils import PYTHON_VERSION, insecure_hash from mlflow.utils.process import _exec_cmd from mlflow.utils.requirements_utils import ( _infer_requirements, @@ -557,7 +556,7 @@ def _get_mlflow_env_name(s): :returns: String in the form of "mlflow-{hash}" (e.g. 
"mlflow-da39a3ee5e6b4b0d3255bfef95601890afd80709") """ - return "mlflow-" + hashlib.sha1(s.encode("utf-8")).hexdigest() + return "mlflow-" + insecure_hash.sha1(s.encode("utf-8")).hexdigest() def _get_pip_install_mlflow(): diff --git a/mlflow/utils/file_utils.py b/mlflow/utils/file_utils.py index f59e3f3033529..73c7c192147b1 100644 --- a/mlflow/utils/file_utils.py +++ b/mlflow/utils/file_utils.py @@ -19,8 +19,8 @@ import uuid from concurrent.futures import as_completed from contextlib import contextmanager +from dataclasses import dataclass from subprocess import CalledProcessError, TimeoutExpired -from typing import Optional from urllib.parse import unquote from urllib.request import pathname2url @@ -38,7 +38,7 @@ MLFLOW_DOWNLOAD_CHUNK_TIMEOUT, MLFLOW_ENABLE_ARTIFACTS_PROGRESS_BAR, ) -from mlflow.exceptions import MissingConfigException +from mlflow.exceptions import MissingConfigException, MlflowException from mlflow.protos.databricks_artifacts_pb2 import ArtifactCredentialType from mlflow.utils import download_cloud_file_chunk, merge_dicts from mlflow.utils.databricks_utils import _get_dbutils @@ -661,16 +661,19 @@ def download_file_using_http_uri(http_uri, download_path, chunk_size=100000000, output_file.write(chunk) -class _ChunkDownloadError(Exception): - def __init__(self, retryable: bool, error: str, status_code: Optional[int] = None) -> None: - self.retryable = retryable - self.error = error - self.status_code = status_code - super().__init__( - f"Chunk download failed: {error}" - if status_code is None - else f"Chunk download failed with status code {status_code}: {error}" - ) +@dataclass(frozen=True) +class _Chunk: + index: int + start: int + end: int + + +def _yield_chunks(file_size, chunk_size): + num_requests = int(math.ceil(file_size / float(chunk_size))) + for i in range(num_requests): + range_start = i * chunk_size + range_end = min(range_start + chunk_size - 1, file_size - 1) + yield _Chunk(i, range_start, range_end) def parallelized_download_file_using_http_uri( @@ -694,81 +697,54 @@ def parallelized_download_file_using_http_uri( Returns a dict of chunk index : exception, if one was thrown for that index. 
""" - def run_download(range_start, range_end): - template = """ + def run_download(chunk: _Chunk): + try: + subprocess.run( + [ + sys.executable, + download_cloud_file_chunk.__file__, + "--range-start", + str(chunk.start), + "--range-end", + str(chunk.end), + "--headers", + json.dumps(headers or {}), + "--download-path", + download_path, + "--http-uri", + http_uri, + ], + text=True, + check=True, + capture_output=True, + timeout=MLFLOW_DOWNLOAD_CHUNK_TIMEOUT.get(), + env=env, + ) + except (TimeoutExpired, CalledProcessError) as e: + raise MlflowException( + f""" ----- stdout ----- -{stdout} +{e.stdout.strip()} ----- stderr ----- -{stderr} +{e.stderr.strip()} """ - with tempfile.TemporaryDirectory() as tmpdir: - json_file = os.path.join(tmpdir, "http_error.json") - try: - subprocess.run( - [ - sys.executable, - download_cloud_file_chunk.__file__, - "--range-start", - str(range_start), - "--range-end", - str(range_end), - "--headers", - json.dumps(headers or {}), - "--download-path", - download_path, - "--http-uri", - http_uri, - "--temp-file", - json_file, - ], - text=True, - check=True, - capture_output=True, - timeout=MLFLOW_DOWNLOAD_CHUNK_TIMEOUT.get(), - env=env, - ) - except TimeoutExpired as e: - raise _ChunkDownloadError( - True, - template.format( - stdout=e.stdout.strip() or "(no stdout)", - stderr=e.stderr.strip() or "(no stderr)", - ), - ) from e - except CalledProcessError as e: - retryable = False - status_code = None - if os.path.exists(json_file): - with open(json_file) as f: - data = json.load(f) - retryable = data.get("retryable", False) - status_code = data.get("status_code") - raise _ChunkDownloadError( - retryable, - template.format( - stdout=e.stdout.strip() or "(no stdout)", - stderr=e.stderr.strip() or "(no stderr)", - ), - status_code, - ) from e - except Exception as e: - raise _ChunkDownloadError(False, str(e)) from e + ) from e - num_requests = int(math.ceil(file_size / float(chunk_size))) + chunks = _yield_chunks(file_size, chunk_size) # Create file if it doesn't exist or erase the contents if it does. We should do this here # before sending to the workers so they can each individually seek to their respective positions # and write chunks without overwriting. with open(download_path, "w"): pass - starting_index = 0 if uri_type == ArtifactCredentialType.GCP_SIGNED_URL or uri_type is None: + chunk = next(chunks) # GCP files could be transcoded, in which case the range header is ignored. # Test if this is the case by downloading one chunk and seeing if it's larger than the # requested size. If yes, let that be the file; if not, continue downloading more chunks. 
download_chunk( - range_start=0, - range_end=chunk_size - 1, + range_start=chunk.start, + range_end=chunk.end, headers=headers, download_path=download_path, http_uri=http_uri, @@ -778,24 +754,16 @@ def run_download(range_start, range_end): # so we don't need to consider this here if downloaded_size > chunk_size: return {} - else: - starting_index = 1 - - futures = {} - for i in range(starting_index, num_requests): - range_start = i * chunk_size - range_end = range_start + chunk_size - 1 - futures[thread_pool_executor.submit(run_download, range_start, range_end)] = i + futures = {thread_pool_executor.submit(run_download, chunk): chunk for chunk in chunks} failed_downloads = {} - with ArtifactProgressBar.chunks(file_size, f"Downloading {download_path}", chunk_size) as pbar: for future in as_completed(futures): - index = futures[future] + chunk = futures[future] try: future.result() except Exception: - failed_downloads[index] = future.exception() + failed_downloads[chunk] = future.exception() else: pbar.update() diff --git a/mlflow/utils/git_utils.py b/mlflow/utils/git_utils.py index 07ec2509bd329..39efe97ccdb53 100644 --- a/mlflow/utils/git_utils.py +++ b/mlflow/utils/git_utils.py @@ -46,6 +46,8 @@ def get_git_commit(path: str) -> Optional[str]: if os.path.isfile(path): path = os.path.dirname(path) repo = Repo(path, search_parent_directories=True) + if path in repo.ignored(path): + return None return repo.head.commit.hexsha except Exception: return None diff --git a/mlflow/utils/insecure_hash.py b/mlflow/utils/insecure_hash.py new file mode 100644 index 0000000000000..7807a6d2e4ff3 --- /dev/null +++ b/mlflow/utils/insecure_hash.py @@ -0,0 +1,15 @@ +import functools +import hashlib +import sys + +# DO NOT use this function for security purposes (e.g., password hashing). +# +# In Python >= 3.9, insecure hashing algorithms such as MD5 fail in FIPS-compliant +# environments unless `usedforsecurity=False` is explicitly passed. 
+# +# References: +# - https://github.com/mlflow/mlflow/issues/9905 +# - https://docs.python.org/3/library/hashlib.html +_kwargs = {"usedforsecurity": False} if sys.version_info >= (3, 9) else {} +md5 = functools.partial(hashlib.md5, **_kwargs) +sha1 = functools.partial(hashlib.sha1, **_kwargs) diff --git a/mlflow/utils/request_utils.py b/mlflow/utils/request_utils.py index f1d5687b3c040..713044299f380 100644 --- a/mlflow/utils/request_utils.py +++ b/mlflow/utils/request_utils.py @@ -39,7 +39,7 @@ def augmented_raise_for_status(response): raise e -def download_chunk(range_start, range_end, headers, download_path, http_uri): +def download_chunk(*, range_start, range_end, headers, download_path, http_uri): combined_headers = {**headers, "Range": f"bytes={range_start}-{range_end}"} with cloud_storage_http_request( diff --git a/mlflow/utils/requirements_utils.py b/mlflow/utils/requirements_utils.py index 9233e52a7303d..57e1e67324aae 100644 --- a/mlflow/utils/requirements_utils.py +++ b/mlflow/utils/requirements_utils.py @@ -330,6 +330,9 @@ def _capture_imported_modules(model_uri, flavor): "databricks.automl_runtime": ["databricks-automl-runtime"], "databricks.model_monitoring": ["databricks-model-monitoring"], } +MLFLOW_MODULES_TO_PACKAGES = { + "mlflow.gateway": ["mlflow[gateway]"], +} _MODULES_TO_PACKAGES = None _PACKAGES_TO_MODULES = None @@ -342,6 +345,9 @@ def _init_modules_to_packages_map(): # https://importlib-metadata.readthedocs.io/en/latest/using.html#using-importlib-metadata _MODULES_TO_PACKAGES = importlib_metadata.packages_distributions() + # Add mapping for MLFlow extras + _MODULES_TO_PACKAGES.update(MLFLOW_MODULES_TO_PACKAGES) + # Multiple packages populate the `databricks` module namespace on Databricks; to avoid # bundling extraneous Databricks packages into model dependencies, we scope each module # to its relevant package @@ -417,7 +423,9 @@ def _infer_requirements(model_uri, flavor): *_MODULES_TO_PACKAGES.get("mlflow", []), ] packages = packages - set(excluded_packages) - unrecognized_packages = packages - _PYPI_PACKAGE_INDEX.package_names + + # manually exclude mlflow[gateway] as it isn't listed separately in PYPI_PACKAGE_INDEX + unrecognized_packages = packages - _PYPI_PACKAGE_INDEX.package_names - {"mlflow[gateway]"} if unrecognized_packages: _logger.warning( "The following packages were not found in the public PyPI package index as of" @@ -426,6 +434,7 @@ def _infer_requirements(model_uri, flavor): _PYPI_PACKAGE_INDEX.date, unrecognized_packages, ) + return sorted(map(_get_pinned_requirement, packages)) @@ -462,18 +471,20 @@ def local(self): return version -def _get_pinned_requirement(package, version=None, module=None, extras=None): +def _get_pinned_requirement(req_str, version=None, module=None): """ Returns a string representing a pinned pip requirement to install the specified package and version (e.g. 'mlflow==1.2.3'). - :param package: The name of the package. + :param req_str: The package requirement string (e.g. "mlflow" or "mlflow[gateway]"). :param version: The version of the package. If None, defaults to the installed version. :param module: The name of the top-level module provided by the package . For example, if `package` is 'scikit-learn', `module` should be 'sklearn'. If None, defaults to `package`. 
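The reworked _get_pinned_requirement appears to lean on a Requirement parser (packaging's Requirement is assumed here) to split a requirement string into its distribution name and extras before pinning. A quick sketch of that parsing; the pinned version below is a made-up example, not a computed value.

from packaging.requirements import Requirement

req = Requirement("mlflow[gateway]")
print(req.name)             # "mlflow"
print(sorted(req.extras))   # ["gateway"]

# The helper pins the base distribution and re-attaches the extras, e.g.:
pinned = f"{req.name}[{','.join(sorted(req.extras))}]==2.8.0"   # illustrative version
print(pinned)               # mlflow[gateway]==2.8.0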
:param extras: A list of extra names for the package """ + req = Requirement(req_str) + package = req.name if version is None: version_raw = _get_installed_version(package, module) local_version_label = _get_local_version_label(version_raw) @@ -492,8 +503,8 @@ def _get_pinned_requirement(package, version=None, module=None, extras=None): else: version = version_raw - if extras: - return f"{package}[{','.join(extras)}]=={version}" + if req.extras: + return f"{package}[{','.join(req.extras)}]=={version}" return f"{package}=={version}" @@ -533,6 +544,14 @@ def _check_requirement_satisfied(requirement_str): requirement=requirement_str, ) + if pkg_name == "mlflow" and "gateway" in req.extras: + try: + from mlflow import gateway # noqa: F401 + except ModuleNotFoundError: + return _MismatchedPackageInfo( + package_name="mlflow[gateway]", installed_version=None, requirement=requirement_str + ) + if ( pkg_name == "mlflow" and installed_version == mlflow.__version__ diff --git a/pyproject.toml b/pyproject.toml index db7dd6df39029..3a6cb66174842 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,7 @@ select = [ "RUF010", "RUF013", "S307", + "S324", "UP004", "UP008", "UP011", @@ -59,6 +60,7 @@ select = [ "T20", "TID252", "TID251", + "W", ] force-exclude = true ignore = [ diff --git a/requirements/core-requirements.txt b/requirements/core-requirements.txt index 99f4cdd62ba2d..11f7e3770348b 100644 --- a/requirements/core-requirements.txt +++ b/requirements/core-requirements.txt @@ -4,7 +4,7 @@ alembic<2,!=1.10.0 docker<7,>=4.0.0 -Flask<3 +Flask<4 numpy<2 scipy<2 pandas<3 diff --git a/requirements/core-requirements.yaml b/requirements/core-requirements.yaml index 5447b07c77648..4ffb44c6b6d99 100644 --- a/requirements/core-requirements.yaml +++ b/requirements/core-requirements.yaml @@ -17,7 +17,7 @@ docker: flask: pip_release: Flask - max_major_version: 2 + max_major_version: 3 numpy: pip_release: numpy diff --git a/requirements/doc-min-requirements.txt b/requirements/doc-min-requirements.txt new file mode 100644 index 0000000000000..e73f9d360fa18 --- /dev/null +++ b/requirements/doc-min-requirements.txt @@ -0,0 +1,8 @@ +# sphinx >= 4.0.0 is incompatible with our custom CSS styles and renders the documents improperly. +# See https://github.com/mlflow/mlflow/pull/4480 +sphinx==3.5.4 +jinja2==3.0.3 +# to be compatible with jinja2==3.0.3 +flask<=2.2.5 +sphinx-autobuild +sphinx-click diff --git a/requirements/doc-requirements.txt b/requirements/doc-requirements.txt index ea7c429684c42..5327d4dde51c6 100644 --- a/requirements/doc-requirements.txt +++ b/requirements/doc-requirements.txt @@ -1,13 +1,8 @@ -# Dev/Deployment -# sphinx >= 4.0.0 is incompatible with our custom CSS styles and renders the documents improperly. 
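A condensed sketch of the import-based availability check for the `gateway` extra, mirroring the try/except pattern added to `_check_requirement_satisfied` above; the helper name is illustrative:

    def _gateway_extra_available() -> bool:
        # mlflow[gateway] is treated as installed only when mlflow.gateway is importable.
        try:
            from mlflow import gateway  # noqa: F401
        except ModuleNotFoundError:
            return False
        return True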
-# See https://github.com/mlflow/mlflow/pull/4480 -sphinx==3.5.4 -jinja2==3.0.3 -# to be compatible with jinja2==3.0.3 -flask<=2.2.5 -sphinx-autobuild -sphinx-click +-r doc-min-requirements.txt tensorflow-cpu<=2.12.0 pyspark datasets keras-core +torch>=1.11.0 +torchvision>=0.12.0 +lightning>=1.8.1 diff --git a/requirements/extra-ml-requirements.txt b/requirements/extra-ml-requirements.txt index a76cca72f0368..1b362e1689756 100644 --- a/requirements/extra-ml-requirements.txt +++ b/requirements/extra-ml-requirements.txt @@ -13,7 +13,7 @@ tensorflow-cpu>=2.8.0 # Required by mlflow.pytorch torch>=1.11.0 torchvision>=0.12.0 -pytorch_lightning>=1.5.10 +lightning>=1.8.1 # Required by mlflow.xgboost xgboost>=0.82 # Required by mlflow.lightgbm diff --git a/tests/data/test_delta_dataset_source.py b/tests/data/test_delta_dataset_source.py index e9d8e4f9f766e..b199e558b1d74 100644 --- a/tests/data/test_delta_dataset_source.py +++ b/tests/data/test_delta_dataset_source.py @@ -14,7 +14,7 @@ def spark_session(): with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/data/test_pandas_dataset.py b/tests/data/test_pandas_dataset.py index 2a452593e3e26..c8244eabbb2a7 100644 --- a/tests/data/test_pandas_dataset.py +++ b/tests/data/test_pandas_dataset.py @@ -24,7 +24,7 @@ def spark_session(): with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/data/test_spark_dataset.py b/tests/data/test_spark_dataset.py index ae2fc350587b4..2283f4bf30ca9 100644 --- a/tests/data/test_spark_dataset.py +++ b/tests/data/test_spark_dataset.py @@ -21,7 +21,7 @@ def spark_session(tmp_path): with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/data/test_spark_dataset_source.py b/tests/data/test_spark_dataset_source.py index 7b68370f2696f..78c0e0ccfee9b 100644 --- a/tests/data/test_spark_dataset_source.py +++ b/tests/data/test_spark_dataset_source.py @@ -14,7 +14,7 @@ def spark_session(): with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/db/check_migration.py b/tests/db/check_migration.py index 104f1cf6ee43a..3bff39184671a 100644 --- a/tests/db/check_migration.py +++ b/tests/db/check_migration.py @@ -109,7 +109,7 @@ def post_migration(): for table in TABLES: df_actual = pd.read_sql(sa.text(f"SELECT * FROM {table}"), conn) df_expected = pd.read_pickle(SNAPSHOTS_DIR / f"{table}.pkl") - 
pd.testing.assert_frame_equal(df_actual, df_expected) + pd.testing.assert_frame_equal(df_actual[df_expected.columns], df_expected) if __name__ == "__main__": diff --git a/tests/db/schemas/mssql.sql b/tests/db/schemas/mssql.sql index 4c033a444d344..8361adab9c319 100644 --- a/tests/db/schemas/mssql.sql +++ b/tests/db/schemas/mssql.sql @@ -16,6 +16,24 @@ CREATE TABLE experiments ( ) +CREATE TABLE input_tags ( + input_uuid VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + name VARCHAR(255) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + value VARCHAR(500) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + CONSTRAINT input_tags_pk PRIMARY KEY (input_uuid, name) +) + + +CREATE TABLE inputs ( + input_uuid VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + source_type VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + source_id VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + destination_type VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + destination_id VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + CONSTRAINT inputs_pk PRIMARY KEY (source_type, source_id, destination_type, destination_id) +) + + CREATE TABLE registered_models ( name VARCHAR(256) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, creation_time BIGINT, @@ -25,6 +43,20 @@ CREATE TABLE registered_models ( ) +CREATE TABLE datasets ( + dataset_uuid VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + experiment_id INTEGER NOT NULL, + name VARCHAR(500) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + digest VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + dataset_source_type VARCHAR(36) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + dataset_source VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + dataset_schema VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS", + dataset_profile VARCHAR COLLATE "SQL_Latin1_General_CP1_CI_AS", + CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest), + CONSTRAINT "FK__datasets__experi__6477ECF3" FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) +) + + CREATE TABLE experiment_tags ( key VARCHAR(250) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, value VARCHAR(5000) COLLATE "SQL_Latin1_General_CP1_CI_AS", @@ -47,11 +79,21 @@ CREATE TABLE model_versions ( status VARCHAR(20) COLLATE "SQL_Latin1_General_CP1_CI_AS", status_message VARCHAR(500) COLLATE "SQL_Latin1_General_CP1_CI_AS", run_link VARCHAR(500) COLLATE "SQL_Latin1_General_CP1_CI_AS", + storage_location VARCHAR(500) COLLATE "SQL_Latin1_General_CP1_CI_AS", CONSTRAINT model_version_pk PRIMARY KEY (name, version), CONSTRAINT "FK__model_vers__name__5812160E" FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ) +CREATE TABLE registered_model_aliases ( + alias VARCHAR(256) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + version INTEGER NOT NULL, + name VARCHAR(256) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, + CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), + CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON DELETE CASCADE ON UPDATE CASCADE +) + + CREATE TABLE registered_model_tags ( key VARCHAR(250) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, value VARCHAR(5000) COLLATE "SQL_Latin1_General_CP1_CI_AS", @@ -131,11 +173,3 @@ CREATE TABLE tags ( CONSTRAINT tag_pk PRIMARY KEY (key, run_uuid), CONSTRAINT "FK__tags__run_uuid__412EB0B6" FOREIGN KEY(run_uuid) REFERENCES runs (run_uuid) ) - -CREATE TABLE 
registered_model_aliases ( - name VARCHAR(256) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, - alias VARCHAR(256) COLLATE "SQL_Latin1_General_CP1_CI_AS" NOT NULL, - version INTEGER NOT NULL, - CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), - CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ON DELETE CASCADE -) diff --git a/tests/db/schemas/mysql.sql b/tests/db/schemas/mysql.sql index 31b9cd857da2d..757201f205dfd 100644 --- a/tests/db/schemas/mysql.sql +++ b/tests/db/schemas/mysql.sql @@ -17,6 +17,24 @@ CREATE TABLE experiments ( ) +CREATE TABLE input_tags ( + input_uuid VARCHAR(36) NOT NULL, + name VARCHAR(255) NOT NULL, + value VARCHAR(500) NOT NULL, + PRIMARY KEY (input_uuid, name) +) + + +CREATE TABLE inputs ( + input_uuid VARCHAR(36) NOT NULL, + source_type VARCHAR(36) NOT NULL, + source_id VARCHAR(36) NOT NULL, + destination_type VARCHAR(36) NOT NULL, + destination_id VARCHAR(36) NOT NULL, + PRIMARY KEY (source_type, source_id, destination_type, destination_id) +) + + CREATE TABLE registered_models ( name VARCHAR(256) NOT NULL, creation_time BIGINT, @@ -26,6 +44,20 @@ CREATE TABLE registered_models ( ) +CREATE TABLE datasets ( + dataset_uuid VARCHAR(36) NOT NULL, + experiment_id INTEGER NOT NULL, + name VARCHAR(500) NOT NULL, + digest VARCHAR(36) NOT NULL, + dataset_source_type VARCHAR(36) NOT NULL, + dataset_source TEXT NOT NULL, + dataset_schema TEXT, + dataset_profile MEDIUMTEXT, + PRIMARY KEY (experiment_id, name, digest), + CONSTRAINT datasets_ibfk_1 FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) +) + + CREATE TABLE experiment_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -48,11 +80,21 @@ CREATE TABLE model_versions ( status VARCHAR(20), status_message VARCHAR(500), run_link VARCHAR(500), + storage_location VARCHAR(500), PRIMARY KEY (name, version), CONSTRAINT model_versions_ibfk_1 FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ) +CREATE TABLE registered_model_aliases ( + alias VARCHAR(256) NOT NULL, + version INTEGER NOT NULL, + name VARCHAR(256) NOT NULL, + PRIMARY KEY (name, alias), + CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON DELETE CASCADE ON UPDATE CASCADE +) + + CREATE TABLE registered_model_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -138,12 +180,3 @@ CREATE TABLE tags ( PRIMARY KEY (key, run_uuid), CONSTRAINT tags_ibfk_1 FOREIGN KEY(run_uuid) REFERENCES runs (run_uuid) ) - - -CREATE TABLE registered_model_aliases ( - name VARCHAR(256) NOT NULL, - alias VARCHAR(256) NOT NULL, - version INTEGER NOT NULL, - CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), - CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ON DELETE CASCADE -) diff --git a/tests/db/schemas/postgresql.sql b/tests/db/schemas/postgresql.sql index 223c8cb9b8730..158fe88288698 100644 --- a/tests/db/schemas/postgresql.sql +++ b/tests/db/schemas/postgresql.sql @@ -14,7 +14,25 @@ CREATE TABLE experiments ( last_update_time BIGINT, CONSTRAINT experiment_pk PRIMARY KEY (experiment_id), CONSTRAINT experiments_name_key UNIQUE (name), - CONSTRAINT experiments_lifecycle_stage CHECK ((lifecycle_stage)::text = ANY ((ARRAY['active'::character varying, 'deleted'::character varying])::text[])) + CONSTRAINT experiments_lifecycle_stage CHECK (lifecycle_stage::text = ANY (ARRAY['active'::character varying, 'deleted'::character 
varying]::text[])) +) + + +CREATE TABLE input_tags ( + input_uuid VARCHAR(36) NOT NULL, + name VARCHAR(255) NOT NULL, + value VARCHAR(500) NOT NULL, + CONSTRAINT input_tags_pk PRIMARY KEY (input_uuid, name) +) + + +CREATE TABLE inputs ( + input_uuid VARCHAR(36) NOT NULL, + source_type VARCHAR(36) NOT NULL, + source_id VARCHAR(36) NOT NULL, + destination_type VARCHAR(36) NOT NULL, + destination_id VARCHAR(36) NOT NULL, + CONSTRAINT inputs_pk PRIMARY KEY (source_type, source_id, destination_type, destination_id) ) @@ -27,6 +45,20 @@ CREATE TABLE registered_models ( ) +CREATE TABLE datasets ( + dataset_uuid VARCHAR(36) NOT NULL, + experiment_id INTEGER NOT NULL, + name VARCHAR(500) NOT NULL, + digest VARCHAR(36) NOT NULL, + dataset_source_type VARCHAR(36) NOT NULL, + dataset_source TEXT NOT NULL, + dataset_schema TEXT, + dataset_profile TEXT, + CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest), + CONSTRAINT datasets_experiment_id_fkey FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) +) + + CREATE TABLE experiment_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -49,11 +81,21 @@ CREATE TABLE model_versions ( status VARCHAR(20), status_message VARCHAR(500), run_link VARCHAR(500), + storage_location VARCHAR(500), CONSTRAINT model_version_pk PRIMARY KEY (name, version), CONSTRAINT model_versions_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ) +CREATE TABLE registered_model_aliases ( + alias VARCHAR(256) NOT NULL, + version INTEGER NOT NULL, + name VARCHAR(256) NOT NULL, + CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), + CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON DELETE CASCADE ON UPDATE CASCADE +) + + CREATE TABLE registered_model_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -80,15 +122,15 @@ CREATE TABLE runs ( deleted_time BIGINT, CONSTRAINT run_pk PRIMARY KEY (run_uuid), CONSTRAINT runs_experiment_id_fkey FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id), - CONSTRAINT source_type CHECK ((source_type)::text = ANY ((ARRAY['NOTEBOOK'::character varying, 'JOB'::character varying, 'LOCAL'::character varying, 'UNKNOWN'::character varying, 'PROJECT'::character varying])::text[])), - CONSTRAINT runs_lifecycle_stage CHECK ((lifecycle_stage)::text = ANY ((ARRAY['active'::character varying, 'deleted'::character varying])::text[])), - CONSTRAINT runs_status_check CHECK ((status)::text = ANY ((ARRAY['SCHEDULED'::character varying, 'FAILED'::character varying, 'FINISHED'::character varying, 'RUNNING'::character varying, 'KILLED'::character varying])::text[])) + CONSTRAINT runs_lifecycle_stage CHECK (lifecycle_stage::text = ANY (ARRAY['active'::character varying, 'deleted'::character varying]::text[])), + CONSTRAINT runs_status_check CHECK (status::text = ANY (ARRAY['SCHEDULED'::character varying, 'FAILED'::character varying, 'FINISHED'::character varying, 'RUNNING'::character varying, 'KILLED'::character varying]::text[])), + CONSTRAINT source_type CHECK (source_type::text = ANY (ARRAY['NOTEBOOK'::character varying, 'JOB'::character varying, 'LOCAL'::character varying, 'UNKNOWN'::character varying, 'PROJECT'::character varying]::text[])) ) CREATE TABLE latest_metrics ( key VARCHAR(250) NOT NULL, - value DOUBLE_PRECISION NOT NULL, + value DOUBLE PRECISION NOT NULL, timestamp BIGINT, step BIGINT NOT NULL, is_nan BOOLEAN NOT NULL, @@ -100,7 +142,7 @@ CREATE TABLE latest_metrics ( CREATE TABLE metrics ( key VARCHAR(250) NOT NULL, - 
value DOUBLE_PRECISION NOT NULL, + value DOUBLE PRECISION NOT NULL, timestamp BIGINT NOT NULL, run_uuid VARCHAR(32) NOT NULL, step BIGINT DEFAULT '0'::bigint NOT NULL, @@ -136,12 +178,3 @@ CREATE TABLE tags ( CONSTRAINT tag_pk PRIMARY KEY (key, run_uuid), CONSTRAINT tags_run_uuid_fkey FOREIGN KEY(run_uuid) REFERENCES runs (run_uuid) ) - - -CREATE TABLE registered_model_aliases ( - name VARCHAR(256) NOT NULL, - alias VARCHAR(256) NOT NULL, - version INTEGER NOT NULL, - CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), - CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ON DELETE CASCADE -) diff --git a/tests/db/schemas/sqlite.sql b/tests/db/schemas/sqlite.sql index 22162210ef130..09e310ea2e99c 100644 --- a/tests/db/schemas/sqlite.sql +++ b/tests/db/schemas/sqlite.sql @@ -18,6 +18,24 @@ CREATE TABLE experiments ( ) +CREATE TABLE input_tags ( + input_uuid VARCHAR(36) NOT NULL, + name VARCHAR(255) NOT NULL, + value VARCHAR(500) NOT NULL, + CONSTRAINT input_tags_pk PRIMARY KEY (input_uuid, name) +) + + +CREATE TABLE inputs ( + input_uuid VARCHAR(36) NOT NULL, + source_type VARCHAR(36) NOT NULL, + source_id VARCHAR(36) NOT NULL, + destination_type VARCHAR(36) NOT NULL, + destination_id VARCHAR(36) NOT NULL, + CONSTRAINT inputs_pk PRIMARY KEY (source_type, source_id, destination_type, destination_id) +) + + CREATE TABLE registered_models ( name VARCHAR(256) NOT NULL, creation_time BIGINT, @@ -28,6 +46,20 @@ CREATE TABLE registered_models ( ) +CREATE TABLE datasets ( + dataset_uuid VARCHAR(36) NOT NULL, + experiment_id INTEGER NOT NULL, + name VARCHAR(500) NOT NULL, + digest VARCHAR(36) NOT NULL, + dataset_source_type VARCHAR(36) NOT NULL, + dataset_source TEXT NOT NULL, + dataset_schema TEXT, + dataset_profile TEXT, + CONSTRAINT dataset_pk PRIMARY KEY (experiment_id, name, digest), + FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id) +) + + CREATE TABLE experiment_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -50,11 +82,21 @@ CREATE TABLE model_versions ( status VARCHAR(20), status_message VARCHAR(500), run_link VARCHAR(500), + storage_location VARCHAR(500), CONSTRAINT model_version_pk PRIMARY KEY (name, version), FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ) +CREATE TABLE registered_model_aliases ( + alias VARCHAR(256) NOT NULL, + version INTEGER NOT NULL, + name VARCHAR(256) NOT NULL, + CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), + CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON DELETE CASCADE ON UPDATE CASCADE +) + + CREATE TABLE registered_model_tags ( key VARCHAR(250) NOT NULL, value VARCHAR(5000), @@ -81,8 +123,8 @@ CREATE TABLE runs ( deleted_time BIGINT, CONSTRAINT run_pk PRIMARY KEY (run_uuid), FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id), - CONSTRAINT source_type CHECK (source_type IN ('NOTEBOOK', 'JOB', 'LOCAL', 'UNKNOWN', 'PROJECT')), CONSTRAINT runs_lifecycle_stage CHECK (lifecycle_stage IN ('active', 'deleted')), + CONSTRAINT source_type CHECK (source_type IN ('NOTEBOOK', 'JOB', 'LOCAL', 'UNKNOWN', 'PROJECT')), CHECK (status IN ('SCHEDULED', 'FAILED', 'FINISHED', 'RUNNING', 'KILLED')) ) @@ -139,12 +181,3 @@ CREATE TABLE tags ( CONSTRAINT tag_pk PRIMARY KEY (key, run_uuid), FOREIGN KEY(run_uuid) REFERENCES runs (run_uuid) ) - - -CREATE TABLE registered_model_aliases ( - name VARCHAR(256) NOT NULL, - alias VARCHAR(256) NOT NULL, - version INTEGER NOT NULL, - 
CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), - CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ON DELETE CASCADE -) diff --git a/tests/evaluate/test_default_evaluator.py b/tests/evaluate/test_default_evaluator.py index ce6ffa30883a9..06c81a733d7e1 100644 --- a/tests/evaluate/test_default_evaluator.py +++ b/tests/evaluate/test_default_evaluator.py @@ -26,6 +26,7 @@ MetricValue, make_metric, ) +from mlflow.metrics.genai import model_utils from mlflow.models import Model from mlflow.models.evaluation.artifacts import ( CsvEvaluationArtifact, @@ -1348,14 +1349,12 @@ def old_fn(eval_df, builtin_metrics): assert res_metric.justifications is None assert res_metric.aggregate_results["old_fn"] == builtin_metrics["mean_absolute_error"] * 1.5 - new_eval_fn_args = [eval_df, metrics] + new_eval_fn_args = [eval_df, None, metrics] - def new_fn_with_type_hint(eval_df, metrics: Dict[str, MetricValue]): + def new_fn(predictions, targets=None, metrics=None): return metrics["mean_absolute_error"].aggregate_results["mean_absolute_error"] * 1.5 - res_metric = _evaluate_extra_metric( - _CustomMetric(new_fn_with_type_hint, "new_fn", 0), new_eval_fn_args - ) + res_metric = _evaluate_extra_metric(_CustomMetric(new_fn, "new_fn", 0), new_eval_fn_args) assert res_metric.scores is None assert res_metric.justifications is None assert res_metric.aggregate_results["new_fn"] == builtin_metrics["mean_absolute_error"] * 1.5 @@ -1503,10 +1502,10 @@ def test_evaluate_custom_metric_success(): eval_df["target"], eval_df["prediction"], sample_weights=None ) - def example_count_times_1_point_5(eval_df, metrics: Dict[str, MetricValue]): + def example_count_times_1_point_5(predictions, targets=None, metrics=None): return MetricValue( - scores=[score * 1.5 for score in eval_df["prediction"].tolist()], - justifications=["justification"] * len(eval_df["prediction"]), + scores=[score * 1.5 for score in predictions.tolist()], + justifications=["justification"] * len(predictions), aggregate_results={ "example_count_times_1_point_5": metrics["example_count"].aggregate_results[ "example_count" @@ -1515,7 +1514,7 @@ def example_count_times_1_point_5(eval_df, metrics: Dict[str, MetricValue]): }, ) - eval_fn_args = [eval_df, _get_aggregate_metrics_values(builtin_metrics)] + eval_fn_args = [eval_df["prediction"], None, _get_aggregate_metrics_values(builtin_metrics)] res_metric = _evaluate_extra_metric( _CustomMetric(example_count_times_1_point_5, "", 0), eval_fn_args ) @@ -1592,7 +1591,7 @@ def example_custom_artifact_2(_, __, ___): def test_custom_metric_mixed(binary_logistic_regressor_model_uri, breast_cancer_dataset): - def true_count(_eval_df, metrics: Dict[str, MetricValue]): + def true_count(predictions, targets=None, metrics=None): true_negatives = metrics["true_negatives"].aggregate_results["true_negatives"] true_positives = metrics["true_positives"].aggregate_results["true_positives"] return MetricValue(aggregate_results={"true_count": true_negatives + true_positives}) @@ -2143,14 +2142,15 @@ def language_model(inputs: list[str]) -> list[str]: return inputs -def validate_question_answering_logged_data(logged_data, with_targets=True): +def validate_question_answering_logged_data( + logged_data, with_targets=True, predictions_name="outputs" +): columns = { "question", - "outputs", + predictions_name, "toxicity/v1/score", "flesch_kincaid_grade_level/v1/score", "ari_grade_level/v1/score", - "perplexity/v1/score", "token_count", } if with_targets: @@ 
-2159,10 +2159,9 @@ def validate_question_answering_logged_data(logged_data, with_targets=True): assert set(logged_data.columns.tolist()) == columns assert logged_data["question"].tolist() == ["words random", "This is a sentence."] - assert logged_data["outputs"].tolist() == ["words random", "This is a sentence."] + assert logged_data[predictions_name].tolist() == ["words random", "This is a sentence."] assert logged_data["toxicity/v1/score"][0] < 0.5 assert logged_data["toxicity/v1/score"][1] < 0.5 - assert logged_data["perplexity/v1/score"][0] > logged_data["perplexity/v1/score"][1] assert all( isinstance(grade, float) for grade in logged_data["flesch_kincaid_grade_level/v1/score"] ) @@ -2173,8 +2172,11 @@ def validate_question_answering_logged_data(logged_data, with_targets=True): assert logged_data["answer"].tolist() == ["words random", "This is a sentence."] -def test_custom_metrics_deprecated(): - def dummy_fn(eval_df, metrics): +def test_missing_args_raises_exception(): + def dummy_fn1(param_1, param_2, targets, metrics): + pass + + def dummy_fn2(param_3, param_4, builtin_metrics): pass with mlflow.start_run(): @@ -2183,17 +2185,56 @@ def dummy_fn(eval_df, metrics): ) data = pd.DataFrame({"question": ["a", "b"], "answer": ["a", "b"]}) + metric_1 = make_metric(name="metric_1", eval_fn=dummy_fn1, greater_is_better=True) + metric_2 = make_metric(name="metric_2", eval_fn=dummy_fn2, greater_is_better=True) + + error_message = ( + r"Error: Metric calculation failed for the following metrics:\n" + r"Metric 'metric_1' requires the columns \['param_1', 'param_2'\]\n" + r"Metric 'metric_2' requires the columns \['param_3', 'builtin_metrics'\]\n\n" + r"Below are the existing column names for the input/output data:\n" + r"Input Columns: \['question'\]\n" + r"Output Columns: \[\]\n" + r"To resolve this issue, you may want to map the missing column to an existing column\n" + r"using the following configuration:\n" + r"evaluator_config=\{'col_mapping': \{: \}\}" + ) + with pytest.raises( MlflowException, - match="The 'custom_metrics' parameter in mlflow.evaluate is deprecated. Please update " - "your code to only use the 'extra_metrics' parameter instead.", + match=error_message, ): with mlflow.start_run(): mlflow.evaluate( model_info.model_uri, data, targets="answer", + evaluators="default", model_type="question-answering", + extra_metrics=[metric_1, metric_2], + evaluator_config={"col_mapping": {"param_4": "question"}}, + ) + + +def test_custom_metrics_deprecated( + binary_logistic_regressor_model_uri, + breast_cancer_dataset, +): + def dummy_fn(eval_df, metrics): + pass + + with pytest.raises( + MlflowException, + match="The 'custom_metrics' parameter in mlflow.evaluate is deprecated. 
Please update " + "your code to only use the 'extra_metrics' parameter instead.", + ): + with mlflow.start_run(): + mlflow.evaluate( + binary_logistic_regressor_model_uri, + breast_cancer_dataset._constructor_args["data"], + targets=breast_cancer_dataset._constructor_args["targets"], + evaluators="default", + model_type="classifier", custom_metrics=[make_metric(eval_fn=dummy_fn, greater_is_better=True)], extra_metrics=[make_metric(eval_fn=dummy_fn, greater_is_better=True)], ) @@ -2203,10 +2244,11 @@ def dummy_fn(eval_df, metrics): with pytest.warns(FutureWarning, match=message): with mlflow.start_run(): mlflow.evaluate( - model_info.model_uri, - data, - targets="answer", - model_type="question-answering", + binary_logistic_regressor_model_uri, + breast_cancer_dataset._constructor_args["data"], + targets=breast_cancer_dataset._constructor_args["targets"], + evaluators="default", + model_type="classifier", custom_metrics=[make_metric(eval_fn=dummy_fn, greater_is_better=True)], ) @@ -2260,16 +2302,13 @@ def test_evaluate_question_answering_on_static_dataset_with_targets(): artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] assert "eval_results_table.json" in artifacts logged_data = pd.DataFrame(**results.artifacts["eval_results_table"].content) - validate_question_answering_logged_data(logged_data) + validate_question_answering_logged_data(logged_data, predictions_name="pred") assert set(results.metrics.keys()) == { "toxicity/v1/variance", - "perplexity/v1/p90", - "perplexity/v1/variance", "toxicity/v1/ratio", "toxicity/v1/mean", "flesch_kincaid_grade_level/v1/variance", "ari_grade_level/v1/p90", - "perplexity/v1/mean", "flesch_kincaid_grade_level/v1/p90", "flesch_kincaid_grade_level/v1/mean", "ari_grade_level/v1/mean", @@ -2338,7 +2377,6 @@ def validate_text_summarization_logged_data(logged_data, with_targets=True): "toxicity/v1/score", "flesch_kincaid_grade_level/v1/score", "ari_grade_level/v1/score", - "perplexity/v1/score", "token_count", } if with_targets: @@ -2373,7 +2411,7 @@ def validate_text_summarization_logged_data(logged_data, with_targets=True): def get_text_metrics_keys(): - metric_names = ["perplexity", "toxicity", "flesch_kincaid_grade_level", "ari_grade_level"] + metric_names = ["toxicity", "flesch_kincaid_grade_level", "ari_grade_level"] standard_aggregations = ["mean", "variance", "p90"] version = "v1" @@ -2477,13 +2515,20 @@ def test_evaluate_text_summarization_without_targets(): def test_evaluate_text_summarization_fails_to_load_evaluate_metrics(): + from mlflow.metrics.metric_definitions import _cached_evaluate_load + + _cached_evaluate_load.cache_clear() + with mlflow.start_run() as run: model_info = mlflow.pyfunc.log_model( artifact_path="model", python_model=language_model, input_example=["a", "b"] ) data = pd.DataFrame({"text": ["a", "b"], "summary": ["a", "b"]}) - with mock.patch("evaluate.load", side_effect=ImportError("mocked error")) as mock_load: + with mock.patch( + "mlflow.metrics.metric_definitions._cached_evaluate_load", + side_effect=ImportError("mocked error"), + ) as mock_load: results = mlflow.evaluate( model_info.model_uri, data, @@ -2491,7 +2536,6 @@ def test_evaluate_text_summarization_fails_to_load_evaluate_metrics(): model_type="text-summarization", ) mock_load.assert_any_call("rouge") - mock_load.assert_any_call("perplexity", module_type="metric") mock_load.assert_any_call("toxicity", module_type="measurement") client = mlflow.MlflowClient() @@ -2533,7 +2577,6 @@ def test_evaluate_text_and_text_metrics(): "toxicity/v1/score", 
"flesch_kincaid_grade_level/v1/score", "ari_grade_level/v1/score", - "perplexity/v1/score", "token_count", } assert logged_data["text"].tolist() == ["sentence not", "All women are bad."] @@ -2541,15 +2584,13 @@ def test_evaluate_text_and_text_metrics(): # Hateful sentiments should be marked as toxic assert logged_data["toxicity/v1/score"][0] < 0.5 assert logged_data["toxicity/v1/score"][1] > 0.5 - # The perplexity of random words should be higher than a valid sentence. - assert logged_data["perplexity/v1/score"][0] > logged_data["perplexity/v1/score"][1] # Simple sentences should have a low grade level. assert logged_data["flesch_kincaid_grade_level/v1/score"][1] < 4 assert logged_data["ari_grade_level/v1/score"][1] < 4 assert set(results.metrics.keys()) == set(get_text_metrics_keys()) -def very_toxic(eval_df, metrics: Dict[str, MetricValue]): +def very_toxic(predictions, targets=None, metrics=None): new_scores = [1.0 if score > 0.9 else 0.0 for score in metrics["toxicity/v1"].scores] return MetricValue( scores=new_scores, @@ -2558,8 +2599,8 @@ def very_toxic(eval_df, metrics: Dict[str, MetricValue]): ) -def per_row_metric(eval_df, metrics: Dict[str, MetricValue]): - return MetricValue(scores=[1] * len(eval_df["prediction"])) +def per_row_metric(predictions, targets=None, metrics=None): + return MetricValue(scores=[1] * len(predictions)) def test_evaluate_text_custom_metrics(): @@ -2626,7 +2667,6 @@ def test_eval_results_table_json_can_be_prefixed_with_metric_prefix(metric_prefi f"{metric_prefix}toxicity/v1/score", f"{metric_prefix}flesch_kincaid_grade_level/v1/score", f"{metric_prefix}ari_grade_level/v1/score", - f"{metric_prefix}perplexity/v1/score", f"{metric_prefix}token_count", } @@ -2757,6 +2797,7 @@ def test_eval_df(predictions, targets, metrics, inputs, truth, context): model_info.model_uri, data, targets="targets", + predictions="output", model_type="text", extra_metrics=[make_metric(eval_fn=test_eval_df, greater_is_better=True)], custom_artifacts=[example_custom_artifact], @@ -2772,9 +2813,9 @@ def test_eval_df(predictions, targets, metrics, inputs, truth, context): "truth", "targets", "outputs", + "context", "token_count", "toxicity/v1/score", - "perplexity/v1/score", "flesch_kincaid_grade_level/v1/score", "ari_grade_level/v1/score", ] @@ -2805,18 +2846,19 @@ def test_evaluate_no_model_type_with_builtin_metric(): results = mlflow.evaluate( model_info.model_uri, data, - extra_metrics=[mlflow.metrics.perplexity], + extra_metrics=[mlflow.metrics.toxicity()], ) assert results.metrics.keys() == { - "perplexity/v1/mean", - "perplexity/v1/variance", - "perplexity/v1/p90", + "toxicity/v1/mean", + "toxicity/v1/variance", + "toxicity/v1/p90", + "toxicity/v1/ratio", } assert len(results.tables) == 1 assert results.tables["eval_results_table"].columns.tolist() == [ "text", "outputs", - "perplexity/v1/score", + "toxicity/v1/score", ] @@ -2829,7 +2871,7 @@ def test_evaluate_no_model_type_with_custom_metric(): from mlflow.metrics import make_metric from mlflow.metrics.metric_definitions import standard_aggregations - def word_count_eval(predictions, targets, metrics): + def word_count_eval(predictions, targets=None, metrics=None): scores = [] for prediction in predictions: scores.append(len(prediction.split(" "))) @@ -2855,61 +2897,139 @@ def word_count_eval(predictions, targets, metrics): ] -def identity_model(inputs): - return inputs +def multi_output_model(inputs): + return pd.DataFrame( + { + "answer": ["words random", "This is a sentence."], + "source": ["words random", "This is a 
sentence."], + } + ) def test_default_metrics_as_custom_metrics(): with mlflow.start_run() as run: model_info = mlflow.pyfunc.log_model( - artifact_path="model", python_model=identity_model, input_example=["a", "b"] + artifact_path="model", python_model=multi_output_model, input_example=["a"] ) data = pd.DataFrame( { "question": ["words random", "This is a sentence."], "truth": ["words random", "This is a sentence."], - "answer": ["words random", "This is a sentence."], } ) results = evaluate( model_info.model_uri, data, targets="truth", + predictions="answer", model_type="question-answering", custom_metrics=[ - mlflow.metrics.flesch_kincaid_grade_level, - mlflow.metrics.perplexity, - mlflow.metrics.ari_grade_level, - mlflow.metrics.toxicity, - mlflow.metrics.exact_match, + mlflow.metrics.exact_match(), ], evaluators="default", - evaluator_config={ - "predicted_column": "answer", - }, ) client = mlflow.MlflowClient() artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] assert "eval_results_table.json" in artifacts - for metric in ["toxicity", "perplexity", "ari_grade_level", "flesch_kincaid_grade_level"]: + assert "exact_match/v1" in results.metrics.keys() + + +def test_default_metrics_as_custom_metrics_static_dataset(): + with mlflow.start_run() as run: + data = pd.DataFrame( + { + "question": ["words random", "This is a sentence."], + "truth": ["words random", "This is a sentence."], + "answer": ["words random", "This is a sentence."], + "source": ["words random", "This is a sentence."], + } + ) + results = evaluate( + data=data, + targets="truth", + predictions="answer", + model_type="question-answering", + custom_metrics=[ + mlflow.metrics.flesch_kincaid_grade_level(), + mlflow.metrics.ari_grade_level(), + mlflow.metrics.toxicity(), + mlflow.metrics.exact_match(), + ], + evaluators="default", + ) + + client = mlflow.MlflowClient() + artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] + assert "eval_results_table.json" in artifacts + for metric in ["toxicity", "ari_grade_level", "flesch_kincaid_grade_level"]: for measure in ["mean", "p90", "variance"]: assert f"{metric}/v1/{measure}" in results.metrics.keys() assert "exact_match/v1" in results.metrics.keys() +def test_multi_output_model_error_handling(): + with mlflow.start_run(): + model_info = mlflow.pyfunc.log_model( + artifact_path="model", python_model=multi_output_model, input_example=["a"] + ) + data = pd.DataFrame( + { + "question": ["words random", "This is a sentence."], + "truth": ["words random", "This is a sentence."], + } + ) + with pytest.raises( + MlflowException, + match="Output column name is not specified for the multi-output model.", + ): + evaluate( + model_info.model_uri, + data, + targets="truth", + model_type="question-answering", + custom_metrics=[ + mlflow.metrics.flesch_kincaid_grade_level(), + mlflow.metrics.ari_grade_level(), + mlflow.metrics.toxicity(), + mlflow.metrics.exact_match(), + ], + evaluators="default", + ) + + +def test_invalid_extra_metrics(): + with mlflow.start_run(): + model_info = mlflow.pyfunc.log_model( + artifact_path="model", python_model=language_model, input_example=["a", "b"] + ) + data = pd.DataFrame({"text": ["Hello world", "My name is MLflow"]}) + with pytest.raises( + MlflowException, + match="Please ensure that all extra metrics are instances of " + "mlflow.metrics.EvaluationMetric.", + ): + mlflow.evaluate( + model_info.model_uri, + data, + model_type="text", + evaluators="default", + extra_metrics=[mlflow.metrics.latency], + ) + + def 
test_evaluate_with_latency(): with mlflow.start_run() as run: model_info = mlflow.pyfunc.log_model( artifact_path="model", python_model=language_model, input_example=["a", "b"] ) - data = pd.DataFrame({"text": ["sentence not", "All women are bad."]}) + data = pd.DataFrame({"text": ["sentence not", "Hello world."]}) results = mlflow.evaluate( model_info.model_uri, data, model_type="text", evaluators="default", - extra_metrics=[mlflow.metrics.latency], + extra_metrics=[mlflow.metrics.latency()], ) client = mlflow.MlflowClient() @@ -2922,8 +3042,478 @@ def test_evaluate_with_latency(): "toxicity/v1/score", "flesch_kincaid_grade_level/v1/score", "ari_grade_level/v1/score", - "perplexity/v1/score", "latency", "token_count", } assert all(isinstance(grade, float) for grade in logged_data["latency"]) + + +def test_evaluate_with_latency_static_dataset(): + with mlflow.start_run() as run: + mlflow.pyfunc.log_model( + artifact_path="model", python_model=language_model, input_example=["a", "b"] + ) + data = pd.DataFrame( + { + "text": ["foo", "bar"], + "model_output": ["FOO", "BAR"], + } + ) + results = mlflow.evaluate( + data=data, + model_type="text", + evaluators="default", + predictions="model_output", + extra_metrics=[mlflow.metrics.latency()], + ) + + client = mlflow.MlflowClient() + artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] + assert "eval_results_table.json" in artifacts + logged_data = pd.DataFrame(**results.artifacts["eval_results_table"].content) + assert set(logged_data.columns.tolist()) == { + "text", + "outputs", + "toxicity/v1/score", + "flesch_kincaid_grade_level/v1/score", + "ari_grade_level/v1/score", + "latency", + "token_count", + } + assert all(isinstance(grade, float) for grade in logged_data["latency"]) + assert all(grade == 0.0 for grade in logged_data["latency"]) + + +properly_formatted_openai_response1 = { + "candidates": [ + { + "text": '{\n "score": 3,\n "justification": "' "justification" '"\n}', + "metadata": {"finish_reason": "stop"}, + } + ], + "metadata": { + "input_tokens": 569, + "output_tokens": 93, + "total_tokens": 662, + "model": "gpt-3.5-turbo-0613", + "route_type": "llm/v1/completions", + }, +} + + +def test_evaluate_with_correctness(): + metric = mlflow.metrics.make_genai_metric( + name="correctness", + definition=( + "Correctness refers to how well the generated output matches " + "or aligns with the reference or ground truth text that is considered " + "accurate and appropriate for the given input. The ground truth serves as " + "a benchmark against which the provided output is compared to determine the " + "level of accuracy and fidelity." + ), + grading_prompt=( + "Correctness: If the answer correctly answer the question, below " + "are the details for different scores: " + "- Score 0: the answer is completely incorrect, doesn’t mention anything about " + "the question or is completely contrary to the correct answer. " + "- Score 1: the answer provides some relevance to the question and answer " + "one aspect of the question correctly. " + "- Score 2: the answer mostly answer the question but is missing or hallucinating " + "on one critical aspect. 
" + "- Score 4: the answer correctly answer the question and not missing any " + "major aspect" + ), + examples=[], + version="v1", + model="openai:/gpt-3.5-turbo-16k", + grading_context_columns=["ground_truth"], + parameters={"temperature": 0.0}, + aggregations=["mean", "variance", "p90"], + greater_is_better=True, + ) + + with mock.patch.object( + model_utils, + "score_model_on_payload", + return_value=properly_formatted_openai_response1, + ): + with mlflow.start_run(): + eval_df = pd.DataFrame( + { + "inputs": [ + "What is MLflow?", + "What is Spark?", + "What is Python?", + ], + "ground_truth": [ + "MLflow is an open-source platform", + "Apache Spark is an open-source, distributed computing system", + "Python is a high-level programming language", + ], + "prediction": [ + "MLflow is an open-source platform", + "Apache Spark is an open-source, distributed computing system", + "Python is a high-level programming language", + ], + } + ) + results = mlflow.evaluate( + data=eval_df, + evaluators="default", + targets="ground_truth", + predictions="prediction", + extra_metrics=[metric], + ) + + assert results.metrics == { + "correctness/v1/mean": 3.0, + "correctness/v1/variance": 0.0, + "correctness/v1/p90": 3.0, + } + + +def test_evaluate_custom_metrics_string_values(): + with mlflow.start_run(): + model_info = mlflow.pyfunc.log_model( + artifact_path="model", python_model=language_model, input_example=["a", "b"] + ) + data = pd.DataFrame({"text": ["Hello world", "My name is MLflow"]}) + results = mlflow.evaluate( + model_info.model_uri, + data, + extra_metrics=[ + make_metric( + eval_fn=lambda predictions, metrics, eval_config: MetricValue( + aggregate_results={"eval_config_value_average": eval_config} + ), + name="cm", + greater_is_better=True, + long_name="custom_metric", + ) + ], + evaluators="default", + evaluator_config={"eval_config": 3}, + ) + assert results.metrics["cm/eval_config_value_average"] == 3 + + +def validate_retriever_logged_data(logged_data): + columns = { + "question", + "outputs", # TODO: fix the logged data to name the model output column "retrieved_context" + # Right now, it's hard-coded "outputs", which is not ideal + "precision_at_k/v1/score", + "ground_truth", + } + + assert set(logged_data.columns.tolist()) == columns + + assert logged_data["question"].tolist() == ["q1?", "q1?", "q1?"] + assert logged_data["outputs"].tolist() == [["doc1", "doc3", "doc2"]] * 3 + assert (logged_data["precision_at_k/v1/score"] <= 1).all() + assert logged_data["ground_truth"].tolist() == [["doc1", "doc2"]] * 3 + + +def test_evaluate_retriever(): + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + + def fn(X): + return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}) + + with mlflow.start_run() as run: + results = mlflow.evaluate( + model=fn, + data=X, + targets="ground_truth", + model_type="retriever", + evaluators="default", + evaluator_config={ + "k": 3, + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 2 / 3, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 2 / 3, + } + client = mlflow.MlflowClient() + artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] + assert "eval_results_table.json" in artifacts + logged_data = pd.DataFrame(**results.artifacts["eval_results_table"].content) + validate_retriever_logged_data(logged_data) + assert set(results.metrics.keys()) == { + "precision_at_k/v1/p90", + "precision_at_k/v1/mean", + 
"precision_at_k/v1/variance", + } + assert results.metrics["precision_at_k/v1/p90"] == 2 / 3 + assert results.metrics["precision_at_k/v1/mean"] == 2 / 3 + assert results.metrics["precision_at_k/v1/variance"] == 0 + + # test with a big k to ensure we use min(k, len(retrieved_chunks)) + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn, + data=X, + targets="ground_truth", + model_type="retriever", + evaluators="default", + evaluator_config={ + "k": 6, + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 2 / 3, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 2 / 3, + } + + # test with default k + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn, + data=X, + targets="ground_truth", + model_type="retriever", + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 2 / 3, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 2 / 3, + } + + # test with multiple chunks from same doc + def fn2(X): + return pd.DataFrame({"retrieved_context": [("doc1", "doc1", "doc3")] * len(X)}) + + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc3")] * 3}) + + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn2, + data=X, + targets="ground_truth", + model_type="retriever", + evaluator_config={ + "default": { + "k": 3, + } + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1, + } + + # test with empty retrieved doc + def fn3(X): + return pd.DataFrame({"output": [()] * len(X)}) + + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn3, + data=X, + targets="ground_truth", + model_type="retriever", + evaluator_config={ + "default": { + "k": 3, + } + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1, + } + + # test with single retrieved doc + def fn4(X): + return pd.DataFrame({"output": [("doc1")] * len(X)}) + + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn4, + data=X, + targets="ground_truth", + model_type="retriever", + evaluator_config={ + "default": { + "k": 3, + } + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1, + } + + # test with single ground truth doc + X_1 = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1")] * 3}) + + with mlflow.start_run() as run: + mlflow.evaluate( + model=fn, + data=X_1, + targets="ground_truth", + model_type="retriever", + evaluator_config={ + "default": { + "k": 3, + } + }, + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 1 / 3, + "precision_at_k/v1/variance": 0, + "precision_at_k/v1/p90": 1 / 3, + } + + +def test_evaluate_precision_at_k_no_model_type(): + X = pd.DataFrame({"question": ["q1?"] * 3, "ground_truth": [("doc1", "doc2")] * 3}) + + def fn(X): + return pd.DataFrame({"retrieved_context": [("doc1", "doc3", "doc2")] * len(X)}) + + with mlflow.start_run() as run: + results = mlflow.evaluate( + model=fn, + data=X, + targets="ground_truth", + extra_metrics=[mlflow.metrics.precision_at_k(3)], + ) + run = mlflow.get_run(run.info.run_id) + assert run.data.metrics == { + "precision_at_k/v1/mean": 2 / 3, + "precision_at_k/v1/variance": 0, + 
"precision_at_k/v1/p90": 2 / 3, + } + client = mlflow.MlflowClient() + artifacts = [a.path for a in client.list_artifacts(run.info.run_id)] + assert "eval_results_table.json" in artifacts + logged_data = pd.DataFrame(**results.artifacts["eval_results_table"].content) + validate_retriever_logged_data(logged_data) + assert set(results.metrics.keys()) == { + "precision_at_k/v1/p90", + "precision_at_k/v1/mean", + "precision_at_k/v1/variance", + } + assert results.metrics["precision_at_k/v1/p90"] == 2 / 3 + assert results.metrics["precision_at_k/v1/mean"] == 2 / 3 + assert results.metrics["precision_at_k/v1/variance"] == 0 + + +def test_evaluate_with_numpy_array(): + data = [ + ["What is MLflow?"], + ] + ground_truth = [ + "MLflow is an open-source platform for managing the end-to-end machine learning", + ] + + with mlflow.start_run(): + logged_model = mlflow.pyfunc.log_model( + artifact_path="model", python_model=language_model, input_example=["a", "b"] + ) + results = mlflow.evaluate( + logged_model.model_uri, + data, + targets=ground_truth, + extra_metrics=[mlflow.metrics.toxicity()], + ) + + assert results.metrics.keys() == { + "toxicity/v1/mean", + "toxicity/v1/variance", + "toxicity/v1/p90", + "toxicity/v1/ratio", + } + assert len(results.tables) == 1 + assert results.tables["eval_results_table"].columns.tolist() == [ + "feature_1", + "target", + "outputs", + "toxicity/v1/score", + ] + + +def test_target_prediction_col_mapping(): + metric = mlflow.metrics.make_genai_metric( + name="correctness", + definition=( + "Correctness refers to how well the generated output matches " + "or aligns with the reference or ground truth text that is considered " + "accurate and appropriate for the given input. The ground truth serves as " + "a benchmark against which the provided output is compared to determine the " + "level of accuracy and fidelity." + ), + grading_prompt=( + "Correctness: If the answer correctly answer the question, below " + "are the details for different scores: " + "- Score 0: the answer is completely incorrect, doesn't mention anything about " + "the question or is completely contrary to the correct answer. " + "- Score 1: the answer provides some relevance to the question and answer " + "one aspect of the question correctly. " + "- Score 2: the answer mostly answer the question but is missing or hallucinating " + "on one critical aspect. 
" + "- Score 3: the answer correctly answer the question and not missing any " + "major aspect" + ), + examples=[], + version="v1", + model="openai:/gpt-4", + grading_context_columns=["renamed_ground_truth"], + parameters={"temperature": 0.0}, + aggregations=["mean", "variance", "p90"], + greater_is_better=True, + ) + + with mock.patch.object( + model_utils, + "score_model_on_payload", + return_value=properly_formatted_openai_response1, + ): + with mlflow.start_run(): + eval_df = pd.DataFrame( + { + "inputs": [ + "What is MLflow?", + "What is Spark?", + "What is Python?", + ], + "ground_truth": [ + "MLflow is an open-source platform", + "Apache Spark is an open-source, distributed computing system", + "Python is a high-level programming language", + ], + "prediction": [ + "MLflow is an open-source platform", + "Apache Spark is an open-source, distributed computing system", + "Python is a high-level programming language", + ], + } + ) + results = mlflow.evaluate( + data=eval_df, + evaluators="default", + targets="renamed_ground_truth", + predictions="prediction", + extra_metrics=[metric], + evaluator_config={"col_mapping": {"renamed_ground_truth": "ground_truth"}}, + ) + + assert results.metrics == { + "correctness/v1/mean": 3.0, + "correctness/v1/variance": 0.0, + "correctness/v1/p90": 3.0, + } diff --git a/tests/evaluate/test_evaluation.py b/tests/evaluate/test_evaluation.py index b9203dd5d6f2d..01d4d603bef97 100644 --- a/tests/evaluate/test_evaluation.py +++ b/tests/evaluate/test_evaluation.py @@ -1,7 +1,7 @@ -import hashlib import io import json import os +import re import signal import uuid from collections import namedtuple @@ -55,6 +55,7 @@ from mlflow.pyfunc import _ServedPyFuncModel from mlflow.pyfunc.scoring_server.client import ScoringServerClient from mlflow.tracking.artifact_utils import get_artifact_uri +from mlflow.utils import insecure_hash from mlflow.utils.file_utils import TempDir @@ -605,7 +606,6 @@ def test_pandas_df_regressor_evaluation_mlflow_dataset_with_metric_prefix( eval_result = evaluate( linear_regressor_model_uri, data=mlflow_df, - targets="y", model_type="regressor", evaluators=["default"], evaluator_config={ @@ -633,7 +633,6 @@ def test_pandas_df_regressor_evaluation_mlflow_dataset(linear_regressor_model_ur eval_result = evaluate( linear_regressor_model_uri, data=mlflow_df, - targets="y", model_type="regressor", evaluators=["default"], ) @@ -671,25 +670,6 @@ def test_pandas_df_regressor_evaluation_mlflow_dataset_with_targets_from_dataset assert len(datasets[0].tags) == 0 -def test_pandas_df_regressor_evaluation_mlflow_dataset_without_targets(linear_regressor_model_uri): - data = sklearn.datasets.load_diabetes() - df = pd.DataFrame(data.data, columns=data.feature_names) - df["y"] = data.target - mlflow_df = from_pandas(df=df, source="my_src") - with mlflow.start_run(): - with pytest.raises( - MlflowException, - match="The targets argument is required when data is a Dataset and does not define " - "targets.", - ): - evaluate( - linear_regressor_model_uri, - data=mlflow_df, - model_type="regressor", - evaluators=["default"], - ) - - def test_dataset_name(): X, y = get_iris() d1 = EvaluationDataset(data=X, targets=y, name="a1") @@ -710,7 +690,7 @@ def test_dataset_metadata(): def test_gen_md5_for_arraylike_obj(): def get_md5(data): - md5_gen = hashlib.md5() + md5_gen = insecure_hash.md5() _gen_md5_for_arraylike_obj(md5_gen, data) return md5_gen.hexdigest() @@ -930,6 +910,7 @@ def test_evaluator_evaluation_interface(multiclass_logistic_regressor_model_uri, 
extra_metrics=None, custom_artifacts=None, baseline_model=None, + predictions=None, ) @@ -1011,6 +992,7 @@ def get_evaluate_call_arg(model, evaluator_config): "custom_metrics": None, "custom_artifacts": None, "baseline_model": baseline_model, + "predictions": None, } # evaluators = None is the case evaluators unspecified, it should fetch all registered @@ -1317,6 +1299,105 @@ def test_evaluate_lightgbm_regressor(): assert "root_mean_squared_error" in run.data.metrics +def test_evaluate_with_targets_error_handling(): + import lightgbm as lgb + + X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) + X = X[::5] + y = y[::5] + lgb_data = lgb.Dataset(X, label=y) + model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5) + ERROR_TYPE_1 = ( + "The top-level targets parameter should not be specified since a Dataset " + "is used. Please only specify the targets column name in the Dataset. For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`. " + "Meanwhile, please specify `mlflow.evaluate(..., targets=None, ...)`." + ) + ERROR_TYPE_2 = ( + "The targets column name must be specified in the provided Dataset " + "for regressor models. For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), targets='y')`" + ) + ERROR_TYPE_3 = "The targets argument must be specified for regressor models." + + pandas_dataset_no_targets = X + mlflow_dataset_no_targets = mlflow.data.from_pandas(df=X.assign(y=y)) + mlflow_dataset_with_targets = mlflow.data.from_pandas(df=X.assign(y=y), targets="y") + + with mlflow.start_run(): + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): + mlflow.evaluate( + model=model, + data=mlflow_dataset_with_targets, + model_type="regressor", + targets="y", + ) + + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): + mlflow.evaluate( + model=model, + data=mlflow_dataset_no_targets, + model_type="regressor", + targets="y", + ) + + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): + mlflow.evaluate( + model=model, + data=mlflow_dataset_with_targets, + model_type="question-answering", + targets="y", + ) + + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_1)): + mlflow.evaluate( + model=model, + data=mlflow_dataset_no_targets, + model_type="question-answering", + targets="y", + ) + + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_2)): + mlflow.evaluate( + model=model, + data=mlflow_dataset_no_targets, + model_type="regressor", + ) + + with pytest.raises(MlflowException, match=re.escape(ERROR_TYPE_3)): + mlflow.evaluate( + model=model, + data=pandas_dataset_no_targets, + model_type="regressor", + ) + + +def test_evaluate_with_predictions_error_handling(): + import lightgbm as lgb + + X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) + X = X[::5] + y = y[::5] + lgb_data = lgb.Dataset(X, label=y) + model = lgb.train({"objective": "regression"}, lgb_data, num_boost_round=5) + mlflow_dataset_with_predictions = mlflow.data.from_pandas( + df=X.assign(y=y, model_output=y), + targets="y", + predictions="model_output", + ) + with mlflow.start_run(): + with pytest.raises( + MlflowException, + match="The predictions parameter should not be specified in the Dataset since a model " + "is specified. 
Please remove the predictions column from the Dataset.", + ): + mlflow.evaluate( + model=model, + data=mlflow_dataset_with_predictions, + model_type="regressor", + ) + + def test_evaluate_with_function_input_single_output(): import lightgbm as lgb @@ -1403,20 +1484,6 @@ def test_evaluate_with_static_mlflow_dataset_input(): assert "mean_squared_error" in run.data.metrics assert "root_mean_squared_error" in run.data.metrics - # redundent predictions parameter is allowed - with mlflow.start_run() as run: - mlflow.evaluate( - data=data, - model_type="regressor", - targets="y", - predictions="model_output", # same as data.predictions - ) - - run = mlflow.get_run(run.info.run_id) - assert "mean_absolute_error" in run.data.metrics - assert "mean_squared_error" in run.data.metrics - assert "root_mean_squared_error" in run.data.metrics - def test_evaluate_with_static_spark_dataset_unsupported(): data = sklearn.datasets.load_diabetes() @@ -1470,18 +1537,6 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataframe(): model_type="regressor", ) - with pytest.raises( - MlflowException, - match="The predictions argument cannot be specified when model is specified.", - ): - mlflow.evaluate( - model="models:/test", - data=X.assign(y=y, model_output=y).to_numpy(), - targets="y", - predictions="model_output", - model_type="regressor", - ) - with pytest.raises(MlflowException, match="The data argument cannot be None."): mlflow.evaluate( data=None, @@ -1491,8 +1546,8 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataframe(): with pytest.raises( MlflowException, - match="The specified predictions column 'prediction' is not " - "found in the specified data.", + match="The specified pandas DataFrame does not contain the specified predictions" + " column 'prediction'.", ): mlflow.evaluate( data=X.assign(y=y, model_output=y), @@ -1506,34 +1561,27 @@ def test_evaluate_with_static_dataset_error_handling_pandas_dataset(): X, y = sklearn.datasets.load_diabetes(return_X_y=True, as_frame=True) X = X[::5] y = y[::5] - data = mlflow.data.from_pandas( + dataset_with_predictions = mlflow.data.from_pandas( df=X.assign(y=y, model_output=y), targets="y", predictions="model_output" ) + dataset_no_predictions = mlflow.data.from_pandas(df=X.assign(y=y, model_output=y), targets="y") + ERROR_MESSAGE = ( + "The top-level predictions parameter should not be specified since a Dataset is " + "used. Please only specify the predictions column name in the Dataset. For example: " + "`data = mlflow.data.from_pandas(df=X.assign(y=y), predictions='y')`" + "Meanwhile, please specify `mlflow.evaluate(..., predictions=None, ...)`." + ) with mlflow.start_run(): - with pytest.raises( - MlflowException, - match="The predictions parameter must be None or the same as " - "data.predictions when data.predictions is specified. 
Found predictions='y', " - "data.predictions='model_output'.", - ): + with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)): mlflow.evaluate( - data=data, + data=dataset_with_predictions, model_type="regressor", - targets="y", - predictions="y", # conflict with data.predictions + predictions="model_output", ) - # data.predictions cannot be missing - data = mlflow.data.from_pandas(df=X.assign(y=y, model_output=y), targets="y") - with mlflow.start_run(): - with pytest.raises( - MlflowException, - match="The predictions parameter must be specified with the " - "provided PandasDataset when model=None.", - ): + with pytest.raises(MlflowException, match=re.escape(ERROR_MESSAGE)): mlflow.evaluate( - data=data, + data=dataset_no_predictions, model_type="regressor", - targets="y", predictions="model_output", ) diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index 18a95257309db..14f041a442621 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -11,7 +11,7 @@ from mlflow.utils import process from mlflow.utils.virtualenv import _get_mlflow_virtualenv_root -from tests.helper_functions import clear_hub_cache, get_free_disk_space_in_GiB +from tests.helper_functions import clear_hub_cache from tests.integration.utils import invoke_cli_runner EXAMPLES_DIR = "examples" @@ -28,14 +28,6 @@ def replace_mlflow_with_dev_version(yml_path: Path) -> None: yml_path.write_text(new_src) -@pytest.fixture(autouse=True) -def report_free_disk_space(capsys): - yield - - with capsys.disabled(): - sys.stdout.write(f" | Free disk space: {get_free_disk_space_in_GiB():.1f} GiB") - - @pytest.fixture(autouse=True) def clean_up_mlflow_virtual_environments(): yield diff --git a/tests/helper_functions.py b/tests/helper_functions.py index 79dd26f696e67..1de4d1b5ceba5 100644 --- a/tests/helper_functions.py +++ b/tests/helper_functions.py @@ -4,7 +4,6 @@ import numbers import os import random -import shutil import signal import socket import subprocess @@ -636,7 +635,3 @@ def clear_hub_cache(): except ImportError: # Local import check for mlflow-skinny not including huggingface_hub pass - - -def get_free_disk_space_in_GiB(): - return shutil.disk_usage("/").free / (1024**3) diff --git a/tests/integration/async_logging/test_async_logging_integration.py b/tests/integration/async_logging/test_async_logging_integration.py new file mode 100644 index 0000000000000..eba6674c2cc2e --- /dev/null +++ b/tests/integration/async_logging/test_async_logging_integration.py @@ -0,0 +1,229 @@ +import io +import pickle +import time +import uuid + +import mlflow +from mlflow import MlflowClient +from mlflow.entities.metric import Metric +from mlflow.entities.param import Param +from mlflow.entities.run_tag import RunTag + + +def test_async_logging_mlflow_client_pickle(): + experiment_name = f"mlflow-async-logging-pickle-test-{str(uuid.uuid4())[:8]}" + mlflow_client = MlflowClient() + + buffer = io.BytesIO() + pickle.dump(mlflow_client, buffer) + + deserialized_mlflow_client = pickle.loads(buffer.getvalue()) # Type: MlflowClient + experiment_id = deserialized_mlflow_client.create_experiment(experiment_name) + + run = deserialized_mlflow_client.create_run(experiment_id=experiment_id) + run_id = run.info.run_id + + run_operations = [] + + params_to_log = [] + param1 = Param("async param 1", "async param 1 value") + run_operations.append( + mlflow_client.log_param(run_id, param1.key, param1.value, synchronous=False) + ) + params_to_log.append(param1) + + for run_operation in 
run_operations: + run_operation.wait() + run = mlflow_client.get_run(run_id) + assert param1.key in run.data.params + assert param1.value == run.data.params[param1.key] + + +def test_async_logging_mlflow_client(): + experiment_name = f"mlflow-async-logging-test-{str(uuid.uuid4())[:8]}" + mlflow_client = MlflowClient() + experiment_id = mlflow_client.create_experiment(experiment_name) + + run = mlflow_client.create_run(experiment_id=experiment_id) + run_id = run.info.run_id + + run_operations = [] + + params_to_log = [] + param1 = Param("async param 1", "async param 1 value") + run_operations.append( + mlflow_client.log_param(run_id, param1.key, param1.value, synchronous=False) + ) + params_to_log.append(param1) + + tags_to_log = [] + tag1 = RunTag("async tag 1", "async tag 1 value") + run_operations.append(mlflow_client.set_tag(run_id, tag1.key, tag1.value, synchronous=False)) + tags_to_log.append(tag1) + + metrics_to_log = [] + metric1 = Metric("async metric 1", 1, 132, 0) + run_operations.append( + mlflow_client.log_metric( + run_id, metric1.key, metric1.value, metric1.timestamp, metric1.step, synchronous=False + ) + ) + metrics_to_log.append(metric1) + + # Log batch of metrics + metric_value = 1 + for _ in range(1, 5): + metrics = [] + guid8 = str(uuid.uuid4())[:8] + params = [Param(f"batch param-{guid8}-{val}", value=str(val)) for val in range(1)] + tags = [RunTag(f"batch tag-{guid8}-{val}", value=str(val)) for val in range(1)] + for _ in range(0, 50): + metric_value += 1 + metrics.append( + Metric( + key=f"batch metrics async-{metric_value}", + value=time.time(), + timestamp=metric_value, + step=0, + ) + ) + + params_to_log.extend(params) + tags_to_log.extend(tags) + metrics_to_log.extend(metrics) + run_operation = mlflow_client.log_batch( + run_id, + params=params, + tags=tags, + metrics=metrics, + synchronous=False, + ) + run_operations.append(run_operation) + + for run_operation in run_operations: + run_operation.wait() + + run = mlflow_client.get_run(run_id) + for tag in tags_to_log: + assert tag.key in run.data.tags + assert tag.value == run.data.tags[tag.key] + for param in params_to_log: + assert param.key in run.data.params + assert param.value == run.data.params[param.key] + for metric in metrics_to_log: + assert metric.key in run.data.metrics + assert metric.value == run.data.metrics[metric.key] + + mlflow_client.set_terminated(run_id=run_id, status="FINISHED", end_time=time.time()) + + +def test_async_logging_fluent(): + experiment_name = f"mlflow-async-logging-test-{str(uuid.uuid4())[:8]}" + experiment_id = mlflow.create_experiment(experiment_name) + + run_operations = [] + + with mlflow.start_run(experiment_id=experiment_id) as run: + run_id = run.info.run_id + params_to_log = [] + param1 = Param("async param 1", "async param 1 value") + run_operations.append(mlflow.log_param(param1.key, param1.value, synchronous=False)) + params_to_log.append(param1) + + tags_to_log = [] + tag1 = RunTag("async tag 1", "async tag 1 value") + run_operations.append(mlflow.set_tag(tag1.key, tag1.value, synchronous=False)) + tags_to_log.append(tag1) + + metrics_to_log = [] + metric1 = Metric("async metric 1", 1, 432, 0) + run_operations.append(mlflow.log_metric(metric1.key, metric1.value, synchronous=False)) + metrics_to_log.append(metric1) + + # Log batch of metrics + metric_value = 1 + for _ in range(1, 5): + metrics = [] + guid8 = str(uuid.uuid4())[:8] + params = [Param(f"batch param-{guid8}-{val}", value=str(val)) for val in range(5)] + tags = [RunTag(f"batch tag-{guid8}-{val}", 
value=str(val)) for val in range(5)] + for _ in range(0, 50): + metric_value += 1 + metrics.append( + Metric( + key=f"batch metrics async-{metric_value}", + value=time.time(), + timestamp=metric_value, + step=0, + ) + ) + + params_to_log.extend(params) + run_operation = mlflow.log_params( + params={param.key: param.value for param in params}, + synchronous=False, + ) + run_operations.append(run_operation) + + tags_to_log.extend(tags) + run_operation = mlflow.set_tags( + tags={tag.key: tag.value for tag in tags}, + synchronous=False, + ) + run_operations.append(run_operation) + + metrics_to_log.extend(metrics) + run_operation = mlflow.log_metrics( + metrics={metric.key: metric.value for metric in metrics}, + step=1, + synchronous=False, + ) + run_operations.append(run_operation) + + for run_operation in run_operations: + run_operation.wait() + + run = mlflow.run + run = mlflow.get_run(run_id) + for tag in tags_to_log: + assert tag.key in run.data.tags + assert tag.value == run.data.tags[tag.key] + for param in params_to_log: + assert param.key in run.data.params + assert param.value == run.data.params[param.key] + for metric in metrics_to_log: + assert metric.key in run.data.metrics + assert metric.value == run.data.metrics[metric.key] + + +def test_async_logging_fluent_check_batch_split(): + # Check that batch is split into multiple requests if it exceeds the maximum size + # and if we wait for RunOperations returned then at the end everything should be logged. + experiment_name = f"mlflow-async-logging-test-{str(uuid.uuid4())[:8]}" + experiment_id = mlflow.create_experiment(experiment_name) + + run_operations = [] + + with mlflow.start_run(experiment_id=experiment_id) as run: + run_id = run.info.run_id + + metrics_to_log = { + f"batch metrics async-{metric_value}": metric_value for metric_value in range(0, 10000) + } + + run_operations = mlflow.log_metrics( + metrics=metrics_to_log, + step=1, + synchronous=False, + ) + + run_operations.wait() + + # Total 10000 metrics logged, max batch size =1000, so 10 requests will be sent. 
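# Illustrative sketch (not part of the patch): a minimal end-to-end use of the fluent
# async-logging API exercised by these tests, assuming an MLflow version where the fluent
# helpers accept `synchronous=False` and return a handle exposing `.wait()`, as the tests
# above do. The metric names and the 2,500-metric payload are invented for illustration;
# per the preceding comment, oversized batches are split into multiple requests behind
# the scenes.
import mlflow

with mlflow.start_run():
    pending = mlflow.log_metrics(
        metrics={f"demo_metric_{i}": float(i) for i in range(2500)},
        step=0,
        synchronous=False,  # returns immediately instead of blocking on the REST calls
    )
    pending.wait()  # block until every queued batch has been flushed to the tracking server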
+ assert len(run_operations._operation_futures) == 10 + + run = mlflow.run + run = mlflow.get_run(run_id) + for metric_key, metric_value in metrics_to_log.items(): + assert metric_key in run.data.metrics + assert metric_value == run.data.metrics[metric_key] diff --git a/tests/langchain/test_langchain_model_export.py b/tests/langchain/test_langchain_model_export.py index f0cabbb757b41..305420bf70d19 100644 --- a/tests/langchain/test_langchain_model_export.py +++ b/tests/langchain/test_langchain_model_export.py @@ -12,6 +12,7 @@ import pytest import transformers from langchain import SQLDatabase +from langchain.agents import AgentType, initialize_agent from langchain.chains import ( APIChain, ConversationChain, @@ -32,6 +33,7 @@ from langchain.prompts import PromptTemplate from langchain.requests import TextRequestsWrapper from langchain.text_splitter import CharacterTextSplitter +from langchain.tools import Tool from langchain.vectorstores import FAISS from langchain_experimental.sql import SQLDatabaseChain from packaging import version @@ -736,3 +738,29 @@ def test_unsupported_class(): ): with mlflow.start_run(): mlflow.langchain.log_model(llm, "fake_llm") + + +def test_agent_with_unpicklable_tools(tmp_path): + tmp_file = tmp_path / "temp_file.txt" + with open(tmp_file, mode="w") as temp_file: + # files that aren't opened for reading cannot be pickled + tools = [ + Tool.from_function( + func=lambda: temp_file, + name="Write 0", + description="If you need to write 0 to a file", + ) + ] + agent = initialize_agent( + llm=OpenAI(temperature=0), tools=tools, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION + ) + + with pytest.raises( + MlflowException, + match=( + "Error when attempting to pickle the AgentExecutor tools. " + "This model likely does not support serialization." + ), + ): + with mlflow.start_run(): + mlflow.langchain.log_model(agent, "unpicklable_tools") diff --git a/tests/metrics/genai/prompts/test_v1.py b/tests/metrics/genai/prompts/test_v1.py index 57af1c669e92a..5356cec852264 100644 --- a/tests/metrics/genai/prompts/test_v1.py +++ b/tests/metrics/genai/prompts/test_v1.py @@ -41,30 +41,39 @@ def test_evaluation_model_output(): ), ], model="gateway:/gpt-4", - parameters={"temperature": 1.0}, + parameters={"temperature": 0.0}, ).to_dict() assert model1["model"] == "gateway:/gpt-4" - assert model1["parameters"] == {"temperature": 1.0} + assert model1["parameters"] == {"temperature": 0.0} grading_context = {"ground_truth": "This is an output"} - args_string = "\n".join( - [f"Provided {arg}: {arg_value}" for arg, arg_value in grading_context.items()] + args_string = "Additional information used by the model:\n" + "\n".join( + [f"key: {arg}\nvalue:\n{arg_value}" for arg, arg_value in grading_context.items()] ) expected_prompt1 = """ - Please act as an impartial judge and evaluate the quality of the provided output which - attempts to produce output for the provided input based on a provided information. - You'll be given a grading format below which you'll call for each provided information, - input and provided output to submit your justification and score to compute the correctness of - the output. + Task: + You are an impartial judge. You will be given an input that was sent to a machine + learning model, and you will be given an output that the model produced. You + may also be given additional information that was used by the model to generate the output. + + Your task is to determine a numerical score called correctness based on the input and output. 
+ A definition of correctness and a grading rubric are provided below. + You must use the grading rubric to determine your score. You must also justify your score. + + Examples could be included below for reference. Make sure to use them as references and to + understand them before completing the task. Input: This is an input - Provided output: + Output: This is an output - Provided ground_truth: This is an output + Additional information used by the model: + key: ground_truth + value: + This is an output Metric definition: Correctness refers to how well the generated output matches or aligns with the reference or @@ -72,7 +81,7 @@ def test_evaluation_model_output(): truth serves as a benchmark against which the provided output is compared to determine the level of accuracy and fidelity. - Below is your grading criteria: + Grading rubric: Correctness: If the answer correctly answer the question, below are the details for different scores: - Score 1: the answer is completely incorrect, doesn’t mention anything about the @@ -84,22 +93,38 @@ def test_evaluation_model_output(): - Score 5: the answer correctly answer the question and not missing any major aspect Examples: - Input: This is an input - Provided output: This is an output - Provided ground_truth: This is an output - Score: 4 - Justification: This is a justification - - Input: This is an example input 2 - Provided output: This is an example output 2 - Provided ground_truth: This is an output - Score: 4 - Justification: This is an example justification 2 - - And you'll need to submit your grading for the correctness of the output, - using the following in json format: - Score: [your score number for the correctness of the output] - Justification: [your step by step reasoning about the correctness of the output] + Input: + This is an input + + Output: + This is an output + + Additional information used by the model: + key: ground_truth + value: + This is an output + + score: 4 + justification: This is a justification + + + Input: + This is an example input 2 + + Output: + This is an example output 2 + + Additional information used by the model: + key: ground_truth + value: + This is an output + + score: 4 + justification: This is an example justification 2 + + You must return the following fields in your response one below the other: + score: Your numerical score for the model's correctness based on the rubric + justification: Your step-by-step reasoning about the model's correctness score """ prompt1 = model1["eval_prompt"].format( input="This is an input", output="This is an output", grading_context_columns=args_string @@ -125,7 +150,7 @@ def test_evaluation_model_output(): """, ).to_dict() - assert model2["model"] == "openai:/gpt-3.5-turbo-16k" + assert model2["model"] == "openai:/gpt-4" assert model2["parameters"] == { "temperature": 0.0, "max_tokens": 200, @@ -133,16 +158,22 @@ def test_evaluation_model_output(): } args_string = "" expected_prompt2 = """ - Please act as an impartial judge and evaluate the quality of the provided output which - attempts to produce output for the provided input based on a provided information. - You'll be given a grading format below which you'll call for each provided information, - input and provided output to submit your justification and score to compute the correctness of - the output. + Task: + You are an impartial judge. You will be given an input that was sent to a machine + learning model, and you will be given an output that the model produced. 
You + may also be given additional information that was used by the model to generate the output. + + Your task is to determine a numerical score called correctness based on the input and output. + A definition of correctness and a grading rubric are provided below. + You must use the grading rubric to determine your score. You must also justify your score. + + Examples could be included below for reference. Make sure to use them as references and to + understand them before completing the task. Input: This is an input - Provided output: + Output: This is an output Metric definition: @@ -151,7 +182,7 @@ def test_evaluation_model_output(): truth serves as a benchmark against which the provided output is compared to determine the level of accuracy and fidelity. - Below is your grading criteria: + Grading rubric: Correctness: If the answer correctly answer the question, below are the details for different scores: - Score 1: the answer is completely incorrect, doesn’t mention anything about the question @@ -162,10 +193,9 @@ def test_evaluation_model_output(): critical aspect. - Score 5: the answer correctly answer the question and not missing any major aspect - And you'll need to submit your grading for the correctness of the output, - using the following in json format: - Score: [your score number for the correctness of the output] - Justification: [your step by step reasoning about the correctness of the output] + You must return the following fields in your response one below the other: + score: Your numerical score for the model's correctness based on the rubric + justification: Your step-by-step reasoning about the model's correctness score """ prompt2 = model2["eval_prompt"].format( input="This is an input", output="This is an output", grading_context_columns=args_string @@ -184,28 +214,33 @@ def test_no_examples(examples): args_string = "" expected_prompt2 = """ - Please act as an impartial judge and evaluate the quality of the provided output which - attempts to produce output for the provided input based on a provided information. - You'll be given a grading format below which you'll call for each provided information, - input and provided output to submit your justification and score to compute the correctness of - the output. + Task: + You are an impartial judge. You will be given an input that was sent to a machine + learning model, and you will be given an output that the model produced. You + may also be given additional information that was used by the model to generate the output. + + Your task is to determine a numerical score called correctness based on the input and output. + A definition of correctness and a grading rubric are provided below. + You must use the grading rubric to determine your score. You must also justify your score. + + Examples could be included below for reference. Make sure to use them as references and to + understand them before completing the task. 
Input: This is an input - Provided output: + Output: This is an output Metric definition: definition - Below is your grading criteria: + Grading rubric: grading prompt - And you'll need to submit your grading for the correctness of the output, - using the following in json format: - Score: [your score number for the correctness of the output] - Justification: [your step by step reasoning about the correctness of the output] + You must return the following fields in your response one below the other: + score: Your numerical score for the model's correctness based on the rubric + justification: Your step-by-step reasoning about the model's correctness score """ prompt2 = model["eval_prompt"].format( input="This is an input", output="This is an output", grading_context_columns=args_string diff --git a/tests/metrics/genai/test_genai_metrics.py b/tests/metrics/genai/test_genai_metrics.py index ec7b406f2a857..882a7f2131999 100644 --- a/tests/metrics/genai/test_genai_metrics.py +++ b/tests/metrics/genai/test_genai_metrics.py @@ -15,14 +15,16 @@ make_genai_metric, ) from mlflow.metrics.genai.metric_definitions import ( - correctness, - relevance, - strict_correctness, + answer_correctness, + answer_relevance, + answer_similarity, + faithfulness, ) from mlflow.metrics.genai.prompts.v1 import ( - CorrectnessMetric, - RelevanceMetric, - StrictCorrectnessMetric, + AnswerCorrectnessMetric, + AnswerRelevanceMetric, + AnswerSimilarityMetric, + FaithfulnessMetric, ) openai_justification1 = ( @@ -37,7 +39,7 @@ properly_formatted_openai_response1 = { "candidates": [ { - "text": '{\n "Score": 3,\n "Justification": "' f"{openai_justification1}" '"\n}', + "text": '{\n "score": 3,\n "justification": "' f"{openai_justification1}" '"\n}', "metadata": {"finish_reason": "stop"}, } ], @@ -53,7 +55,7 @@ properly_formatted_openai_response2 = { "candidates": [ { - "text": '{\n "Score": 2,\n "Justification": "The provided output gives a correct ' + "text": '{\n "score": 2,\n "justification": "The provided output gives a correct ' "and adequate explanation of what Apache Spark is, covering its main functions and " "components like Spark SQL, Spark Streaming, and MLlib. However, it misses a " "critical aspect, which is Spark's development as a response to the limitations " @@ -78,7 +80,7 @@ incorrectly_formatted_openai_response = { "candidates": [ { - "text": "Score: 2\nJustification: \n\nThe provided output gives some relevant " + "text": "score: 2\njustification: \n\nThe provided output gives some relevant " "information about MLflow including its capabilities such as experiment tracking, " "model packaging, versioning, and deployment. It states that, MLflow simplifies the " "ML lifecycle which aligns partially with the provided ground truth. 
However, it " @@ -179,7 +181,7 @@ def test_make_genai_metric_correct_response(): examples=[mlflow_example], model="gateway:/gpt-3.5-turbo", grading_context_columns=["targets"], - parameters={"temperature": 1.0}, + parameters={"temperature": 0.0}, greater_is_better=True, aggregations=["mean", "variance", "p90"], ) @@ -226,7 +228,6 @@ def test_make_genai_metric_correct_response(): model="openai:/gpt-3.5-turbo", grading_context_columns=["targets"], greater_is_better=True, - aggregations=None, ) with mock.patch.object( model_utils, @@ -242,20 +243,89 @@ def test_make_genai_metric_correct_response(): assert mock_predict_function.call_count == 1 assert mock_predict_function.call_args[0][0] == "openai:/gpt-3.5-turbo" assert mock_predict_function.call_args[0][1] == { - "prompt": "\nPlease act as an impartial judge and evaluate the quality of " - "the provided output which\nattempts to produce output for the provided input " - "based on a provided information.\n\nYou'll be given a grading format below which " - "you'll call for each provided information,\ninput and provided output to submit " - "your justification and score to compute the fake_metric of\nthe output." - "\n\nInput:\ninput\n\nProvided output:\nprediction\n\nProvided targets: " - "ground_truth\n\nMetric definition:\nFake metric definition\n\nBelow is your grading " - "criteria:\nFake metric grading prompt\n\nExamples:\n\nInput: example-input\n\n" - "Provided output: example-output\n\nProvided targets: example-ground_truth\n\n" - "Score: 4\nJustification: example-justification\n\n \n\nAnd you'll need to " - "submit your grading for the fake_metric of the output,\nusing the following in json " - "format:\nScore: [your score number for the fake_metric of the " - "output]\nJustification: [your step by step reasoning about the fake_metric of the " - "output]\n ", + "prompt": "\nTask:\nYou are an impartial judge. You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "fake_metric based on the input and output.\nA definition of " + "fake_metric and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. 
Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" + "\nInput:\ninput\n\nOutput:\nprediction\n\nAdditional information used by the model:\n" + "key: targets\nvalue:\nground_truth\n\nMetric definition:\nFake metric definition\n\n" + "Grading rubric:\nFake metric grading prompt\n\nExamples:\n\nInput:\nexample-input\n\n" + "Output:\nexample-output\n\nAdditional information used by the model:\nkey: targets\n" + "value:\nexample-ground_truth\n\nscore: 4\njustification: " + "example-justification\n \n\nYou must return the following fields in your " + "response one below the other:\nscore: Your numerical score for the model's " + "fake_metric based on the rubric\njustification: Your step-by-step reasoning about " + "the model's fake_metric score\n ", + "temperature": 0.0, + "max_tokens": 200, + "top_p": 1.0, + } + assert metric_value.scores == [3] + assert metric_value.justifications == [openai_justification1] + assert metric_value.aggregate_results == {"mean": 3.0, "p90": 3.0, "variance": 0.0} + + +def test_make_genai_metric_supports_string_value_for_grading_context_columns(): + custom_metric = make_genai_metric( + name="fake_metric", + version="v1", + definition="Fake metric definition", + grading_prompt="Fake metric grading prompt", + model="openai:/gpt-3.5-turbo", + grading_context_columns="targets", + greater_is_better=True, + examples=[ + EvaluationExample( + input="example-input", + output="example-output", + score=4, + justification="example-justification", + grading_context={"targets": "example-ground_truth"}, + ) + ], + ) + + assert [ + param.name for param in inspect.signature(custom_metric.eval_fn).parameters.values() + ] == ["predictions", "metrics", "inputs", "targets"] + + with mock.patch.object( + model_utils, + "score_model_on_payload", + return_value=properly_formatted_openai_response1, + ) as mock_predict_function: + metric_value = custom_metric.eval_fn( + pd.Series(["prediction"]), + {}, + pd.Series(["input"]), + pd.Series(["ground_truth"]), + ) + assert mock_predict_function.call_count == 1 + assert mock_predict_function.call_args[0][0] == "openai:/gpt-3.5-turbo" + assert mock_predict_function.call_args[0][1] == { + "prompt": "\nTask:\nYou are an impartial judge. You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "fake_metric based on the input and output.\nA definition of " + "fake_metric and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. 
Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" + "\nInput:\ninput\n\nOutput:\nprediction\n\nAdditional information used by the model:\n" + "key: targets\nvalue:\nground_truth\n\nMetric definition:\nFake metric definition\n\n" + "Grading rubric:\nFake metric grading prompt\n\nExamples:\n\nInput:\nexample-input\n\n" + "Output:\nexample-output\n\nAdditional information used by the model:\nkey: targets\n" + "value:\nexample-ground_truth\n\nscore: 4\njustification: " + "example-justification\n \n\nYou must return the following fields in your " + "response one below the other:\nscore: Your numerical score for the model's " + "fake_metric based on the rubric\njustification: Your step-by-step reasoning about " + "the model's fake_metric score\n ", "temperature": 0.0, "max_tokens": 200, "top_p": 1.0, @@ -274,7 +344,7 @@ def test_make_genai_metric_incorrect_response(): examples=[mlflow_example], model="gateway:/gpt-3.5-turbo", grading_context_columns=["targets"], - parameters={"temperature": 1.0}, + parameters={"temperature": 0.0}, greater_is_better=True, aggregations=["mean", "variance", "p90"], ) @@ -292,12 +362,52 @@ def test_make_genai_metric_incorrect_response(): ) assert metric_value.scores == [None] - assert metric_value.justifications == [None] + assert metric_value.justifications == [ + f"Failed to extract score and justification. Raw output:" + f" {incorrectly_formatted_openai_response}" + ] assert np.isnan(metric_value.aggregate_results["mean"]) assert np.isnan(metric_value.aggregate_results["variance"]) assert metric_value.aggregate_results["p90"] is None + with mock.patch.object( + model_utils, + "score_model_on_payload", + side_effect=Exception("Some error occurred"), + ): + metric_value = custom_metric.eval_fn( + pd.Series([mlflow_prediction]), + {}, + pd.Series(["What is MLflow?"]), + pd.Series([mlflow_ground_truth]), + ) + + assert metric_value.scores == [None] + assert metric_value.justifications == [ + "Failed to score model on payload. 
Error: Some error occurred" + ] + + assert np.isnan(metric_value.aggregate_results["mean"]) + assert np.isnan(metric_value.aggregate_results["variance"]) + assert metric_value.aggregate_results["p90"] is None + + +def test_malformed_input_raises_exception(): + error_message = "Values for grading_context_columns are malformed and cannot be " + "formatted into a prompt for metric 'answer_similarity'.\nProvided values: {'targets': None}\n" + "Error: TypeError(\"'NoneType' object is not subscriptable\")\n" + + answer_similarity_metric = answer_similarity() + + with pytest.raises( + MlflowException, + match=error_message, + ): + answer_similarity_metric.eval_fn( + pd.Series([mlflow_prediction]), {}, pd.Series([input]), None + ) + def test_make_genai_metric_multiple(): custom_metric = make_genai_metric( @@ -308,7 +418,7 @@ def test_make_genai_metric_multiple(): examples=[mlflow_example], model="gateway:/gpt-3.5-turbo", grading_context_columns=["targets"], - parameters={"temperature": 1.0}, + parameters={"temperature": 0.0}, greater_is_better=True, aggregations=["mean", "variance", "p90"], ) @@ -374,18 +484,6 @@ def test_make_genai_metric_failure(): ) import pandas as pd - custom_metric1 = make_genai_metric( - name="correctness", - version="v-latest", - definition="definition", - grading_prompt="grading_prompt", - examples=[example], - model="model", - grading_context_columns=["targets"], - parameters={"temperature": 1.0}, - greater_is_better=True, - aggregations=["mean"], - ) with pytest.raises( MlflowException, match=re.escape( @@ -393,11 +491,17 @@ def test_make_genai_metric_failure(): "Please check the correctness of the version" ), ): - custom_metric1.eval_fn( - pd.Series(["predictions"]), - {}, - pd.Series(["What is MLflow?"]), - pd.Series(["truth"]), + make_genai_metric( + name="correctness", + version="v-latest", + definition="definition", + grading_prompt="grading_prompt", + examples=[example], + model="model", + grading_context_columns=["targets"], + parameters={"temperature": 0.0}, + greater_is_better=True, + aggregations=["mean"], ) with mock.patch.object( @@ -413,7 +517,7 @@ def test_make_genai_metric_failure(): examples=[example], model="openai:/gpt-3.5-turbo", grading_context_columns=["targets"], - parameters={"temperature": 1.0}, + parameters={"temperature": 0.0}, greater_is_better=True, aggregations=["random-fake"], ) @@ -432,7 +536,9 @@ def test_make_genai_metric_failure(): def test_format_args_string(): variable_string = _format_args_string(["foo", "bar"], {"foo": ["foo"], "bar": ["bar"]}, 0) - assert variable_string == "Provided foo: foo\nProvided bar: bar" + assert variable_string == ( + "Additional information used by the model:\nkey: foo\nvalue:\nfoo" "\nkey: bar\nvalue:\nbar" + ) with pytest.raises( MlflowException, @@ -446,7 +552,7 @@ def test_extract_score_and_justification(): output={ "candidates": [ { - "text": '{"Score": 4, "Justification": "This is a justification"}', + "text": '{"score": 4, "justification": "This is a justification"}', } ] } @@ -459,7 +565,7 @@ def test_extract_score_and_justification(): output={ "candidates": [ { - "text": "Score: 2 \nJustification: This is a justification", + "text": "score: 2 \njustification: This is a justification", } ] } @@ -482,7 +588,7 @@ def test_extract_score_and_justification(): output={ "candidates": [ { - "text": '{"Score": "4", "Justification": "This is a justification"}', + "text": '{"score": "4", "justification": "This is a justification"}', } ] } @@ -491,22 +597,25 @@ def test_extract_score_and_justification(): 
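# Illustrative sketch (not part of the patch): with this change the judge is asked for
# lowercase `score:` / `justification:` fields, and an unsupported version now fails at
# make_genai_metric construction time instead of inside eval_fn. Below is a minimal custom
# metric configured the way these tests configure theirs, assuming make_genai_metric is
# importable from mlflow.metrics.genai and that credentials for the judge model are set;
# the metric name "conciseness" and its wording are invented for illustration.
from mlflow.metrics.genai import make_genai_metric

conciseness = make_genai_metric(
    name="conciseness",
    version="v1",
    definition="Conciseness measures whether the output answers the question without unnecessary detail.",
    grading_prompt="Score 1 for rambling or padded answers, up to score 5 for tight, complete answers.",
    model="openai:/gpt-4",
    grading_context_columns=["targets"],
    parameters={"temperature": 0.0},
    aggregations=["mean", "variance", "p90"],
    greater_is_better=True,
)
# The resulting EvaluationMetric can then be passed to mlflow.evaluate(..., extra_metrics=[conciseness]).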
assert score4 == 4 assert justification4 == "This is a justification" - score5, justification5 = _extract_score_and_justification( - output={ - "candidates": [ - { - "text": '{"Score": 4, "Justification": {"foo": "bar"}}', - } - ] - } - ) + malformed_output = { + "candidates": [ + { + "text": '{"score": 4, "justification": {"foo": "bar"}}', + } + ] + } + + score5, justification5 = _extract_score_and_justification(output=malformed_output) assert score5 is None - assert justification5 is None + assert ( + justification5 + == f"Failed to extract score and justification. Raw output: {malformed_output}" + ) def test_correctness_metric(): - correctness_metric = correctness( + correctness_metric = answer_similarity( model="gateway:/gpt-3.5-turbo", metric_version="v1", examples=[mlflow_example] ) @@ -524,28 +633,33 @@ def test_correctness_metric(): assert mock_predict_function.call_count == 1 assert mock_predict_function.call_args[0][0] == "gateway:/gpt-3.5-turbo" assert mock_predict_function.call_args[0][1] == { - "prompt": "\nPlease act as an impartial judge and evaluate the quality of " - "the provided output which\nattempts to produce output for the provided input " - "based on a provided information.\n\nYou'll be given a grading format below which " - "you'll call for each provided information,\ninput and provided output to submit " - "your justification and score to compute the correctness of\nthe output.\n" + "prompt": "\nTask:\nYou are an impartial judge. You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "answer_similarity based on the input and output.\nA definition of " + "answer_similarity and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. 
Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" f"\nInput:\n{input}\n" - f"\nProvided output:\n{mlflow_prediction}\n" - f"\nProvided targets: {mlflow_ground_truth}\n" - f"\nMetric definition:\n{CorrectnessMetric.definition}\n" - f"\nBelow is your grading criteria:\n{CorrectnessMetric.grading_prompt}\n" + f"\nOutput:\n{mlflow_prediction}\n" + "\nAdditional information used by the model:\nkey: targets\nvalue:\n" + f"{mlflow_ground_truth}\n" + f"\nMetric definition:\n{AnswerSimilarityMetric.definition}\n" + f"\nGrading rubric:\n{AnswerSimilarityMetric.grading_prompt}\n" "\nExamples:\n" - f"\nInput: {mlflow_example.input}\n" - f"\nProvided output: {mlflow_example.output}\n" - f"\nProvided targets: {mlflow_ground_truth}\n" - f"\nScore: {mlflow_example.score}\n" - f"Justification: {mlflow_example.justification}\n\n \n\n" - "And you'll need to submit your grading for the correctness of the output," - "\nusing the following in json format:\n" - "Score: [your score number for the correctness of the output]\n" - "Justification: [your step by step reasoning about the correctness of the output]" - "\n ", - **CorrectnessMetric.parameters, + f"\nInput:\n{mlflow_example.input}\n" + f"\nOutput:\n{mlflow_example.output}\n" + "\nAdditional information used by the model:\nkey: targets\nvalue:\n" + f"{mlflow_ground_truth}\n" + f"\nscore: {mlflow_example.score}\n" + f"justification: {mlflow_example.justification}\n \n" + "\nYou must return the following fields in your response one below the other:\nscore: " + "Your numerical score for the model's answer_similarity based on the " + "rubric\njustification: Your step-by-step reasoning about the model's " + "answer_similarity score\n ", + **AnswerSimilarityMetric.parameters, } assert metric_value.scores == [3] @@ -558,17 +672,18 @@ def test_correctness_metric(): } with pytest.raises( - MlflowException, match="Failed to find correctness metric for version non-existent-version" + MlflowException, + match="Failed to find answer similarity metric for version non-existent-version", ): - correctness_metric = correctness( + answer_similarity( model="gateway:/gpt-3.5-turbo", metric_version="non-existent-version", examples=[mlflow_example], ) -def test_relevance_metric(): - relevance_metric = relevance(model="gateway:/gpt-3.5-turbo", examples=[]) +def test_faithfulness_metric(): + faithfulness_metric = faithfulness(model="gateway:/gpt-3.5-turbo", examples=[]) input = "What is MLflow?" with mock.patch.object( @@ -576,7 +691,7 @@ def test_relevance_metric(): "score_model_on_payload", return_value=properly_formatted_openai_response1, ) as mock_predict_function: - metric_value = relevance_metric.eval_fn( + metric_value = faithfulness_metric.eval_fn( pd.Series([mlflow_prediction]), {}, pd.Series([input]), @@ -585,23 +700,27 @@ def test_relevance_metric(): assert mock_predict_function.call_count == 1 assert mock_predict_function.call_args[0][0] == "gateway:/gpt-3.5-turbo" assert mock_predict_function.call_args[0][1] == { - "prompt": "\nPlease act as an impartial judge and evaluate the quality of " - "the provided output which\nattempts to produce output for the provided input " - "based on a provided information.\n\nYou'll be given a grading format below which " - "you'll call for each provided information,\ninput and provided output to submit " - "your justification and score to compute the relevance of\nthe output.\n" + "prompt": "\nTask:\nYou are an impartial judge. 
You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "faithfulness based on the input and output.\nA definition of " + "faithfulness and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" f"\nInput:\n{input}\n" - f"\nProvided output:\n{mlflow_prediction}\n" - f"\nProvided context: {mlflow_ground_truth}\n" - f"\nMetric definition:\n{RelevanceMetric.definition}\n" - f"\nBelow is your grading criteria:\n{RelevanceMetric.grading_prompt}\n" + f"\nOutput:\n{mlflow_prediction}\n" + "\nAdditional information used by the model:\nkey: context\nvalue:\n" + f"{mlflow_ground_truth}\n" + f"\nMetric definition:\n{FaithfulnessMetric.definition}\n" + f"\nGrading rubric:\n{FaithfulnessMetric.grading_prompt}\n" "\n\n" - "\nAnd you'll need to submit your grading for the relevance of the output," - "\nusing the following in json format:\n" - "Score: [your score number for the relevance of the output]\n" - "Justification: [your step by step reasoning about the relevance of the output]" - "\n ", - **RelevanceMetric.parameters, + "\nYou must return the following fields in your response one below the other:\nscore: " + "Your numerical score for the model's faithfulness based on the " + "rubric\njustification: Your step-by-step reasoning about the model's " + "faithfulness score\n ", + **FaithfulnessMetric.parameters, } assert metric_value.scores == [3] @@ -614,53 +733,113 @@ def test_relevance_metric(): } with pytest.raises( - MlflowException, match="Failed to find relevance metric for version non-existent-version" + MlflowException, match="Failed to find faithfulness metric for version non-existent-version" ): - relevance_metric = relevance( + faithfulness_metric = faithfulness( model="gateway:/gpt-3.5-turbo", metric_version="non-existent-version", examples=[mlflow_example], ) -def test_strict_correctness_metric(): - strict_correctness_metric = strict_correctness() +def test_answer_correctness_metric(): + answer_correctness_metric = answer_correctness() input = "What is MLflow?" 
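# Illustrative sketch (not part of the patch): outside of these unit tests the renamed
# judge metrics are usually not driven through eval_fn directly; they are handed to
# mlflow.evaluate as extra metrics, mirroring the col_mapping test earlier in this patch.
# Assumptions: the import path matches this test module, answer_correctness() defaults to
# an OpenAI-backed judge (so OPENAI_API_KEY must be set), and the column names below are
# invented for the example.
import pandas as pd
import mlflow
from mlflow.metrics.genai.metric_definitions import answer_correctness

eval_df = pd.DataFrame(
    {
        "inputs": ["What is MLflow?"],
        "ground_truth": ["MLflow is an open-source platform"],
        "prediction": ["MLflow is an open-source platform for managing ML workflows"],
    }
)
with mlflow.start_run():
    results = mlflow.evaluate(
        data=eval_df,
        targets="ground_truth",
        predictions="prediction",
        evaluators="default",
        extra_metrics=[answer_correctness()],
    )
    # Per-version aggregates (e.g. a mean score) are reported in results.metrics.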
- examples = "\n".join([str(example) for example in StrictCorrectnessMetric.default_examples]) + examples = "\n".join([str(example) for example in AnswerCorrectnessMetric.default_examples]) with mock.patch.object( model_utils, "score_model_on_payload", return_value=properly_formatted_openai_response1, ) as mock_predict_function: - metric_value = strict_correctness_metric.eval_fn( + metric_value = answer_correctness_metric.eval_fn( pd.Series([mlflow_prediction]), {}, pd.Series([input]), pd.Series([mlflow_ground_truth]), ) assert mock_predict_function.call_count == 1 - assert mock_predict_function.call_args[0][0] == "openai:/gpt-3.5-turbo-16k" + assert mock_predict_function.call_args[0][0] == "openai:/gpt-4" assert mock_predict_function.call_args[0][1] == { - "prompt": "\nPlease act as an impartial judge and evaluate the quality of " - "the provided output which\nattempts to produce output for the provided input " - "based on a provided information.\n\nYou'll be given a grading format below which " - "you'll call for each provided information,\ninput and provided output to submit " - "your justification and score to compute the strict_correctness of\nthe output.\n" + "prompt": "\nTask:\nYou are an impartial judge. You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "answer_correctness based on the input and output.\nA definition of " + "answer_correctness and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. 
Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" f"\nInput:\n{input}\n" - f"\nProvided output:\n{mlflow_prediction}\n" - f"\nProvided targets: {mlflow_ground_truth}\n" - f"\nMetric definition:\n{StrictCorrectnessMetric.definition}\n" - f"\nBelow is your grading criteria:\n{StrictCorrectnessMetric.grading_prompt}\n" + f"\nOutput:\n{mlflow_prediction}\n" + "\nAdditional information used by the model:\nkey: targets\nvalue:\n" + f"{mlflow_ground_truth}\n" + f"\nMetric definition:\n{AnswerCorrectnessMetric.definition}\n" + f"\nGrading rubric:\n{AnswerCorrectnessMetric.grading_prompt}\n" "\nExamples:\n" f"{examples}\n" - "\nAnd you'll need to submit your grading for the strict_correctness of the output," - "\nusing the following in json format:\n" - "Score: [your score number for the strict_correctness of the output]\n" - "Justification: [your step by step reasoning about the strict_correctness of the " - "output]" - "\n ", - **StrictCorrectnessMetric.parameters, + "\nYou must return the following fields in your response one below the other:\nscore: " + "Your numerical score for the model's answer_correctness based on the " + "rubric\njustification: Your step-by-step reasoning about the model's " + "answer_correctness score\n ", + **AnswerCorrectnessMetric.parameters, + } + + assert metric_value.scores == [3] + assert metric_value.justifications == [openai_justification1] + + assert metric_value.aggregate_results == { + "mean": 3, + "variance": 0, + "p90": 3, + } + + with pytest.raises( + MlflowException, + match="Failed to find answer correctness metric for version non-existent-version", + ): + answer_correctness(metric_version="non-existent-version") + + +def test_answer_relevance_metric(): + answer_relevance_metric = answer_relevance(model="gateway:/gpt-3.5-turbo", examples=[]) + input = "What is MLflow?" + + with mock.patch.object( + model_utils, + "score_model_on_payload", + return_value=properly_formatted_openai_response1, + ) as mock_predict_function: + metric_value = answer_relevance_metric.eval_fn( + pd.Series([mlflow_prediction]), + {}, + pd.Series([input]), + pd.Series([mlflow_ground_truth]), + ) + assert mock_predict_function.call_count == 1 + assert mock_predict_function.call_args[0][0] == "gateway:/gpt-3.5-turbo" + assert mock_predict_function.call_args[0][1] == { + "prompt": "\nTask:\nYou are an impartial judge. You will be given an input that was " + "sent to a machine\nlearning model, and you will be given an output that the model " + "produced. You\nmay also be given additional information that was used by the model " + "to generate the output.\n\nYour task is to determine a numerical score called " + "answer_relevance based on the input and output.\nA definition of " + "answer_relevance and a grading rubric are provided below.\nYou must use the " + "grading rubric to determine your score. You must also justify your score." + "\n\nExamples could be included below for reference. 
Make sure to use them as " + "references and to\nunderstand them before completing the task.\n" + f"\nInput:\n{input}\n" + f"\nOutput:\n{mlflow_prediction}\n" + "\nAdditional information used by the model:\nkey: context\nvalue:\n" + f"{mlflow_ground_truth}\n" + f"\nMetric definition:\n{AnswerRelevanceMetric.definition}\n" + f"\nGrading rubric:\n{AnswerRelevanceMetric.grading_prompt}\n" + "\n\n" + "\nYou must return the following fields in your response one below the other:\nscore: " + "Your numerical score for the model's answer_relevance based on the " + "rubric\njustification: Your step-by-step reasoning about the model's " + "answer_relevance score\n ", + **AnswerRelevanceMetric.parameters, } assert metric_value.scores == [3] @@ -674,6 +853,36 @@ def test_strict_correctness_metric(): with pytest.raises( MlflowException, - match="Failed to find strict correctness metric for version non-existent-version", + match="Failed to find answer relevance metric for version non-existent-version", ): - strict_correctness_metric = strict_correctness(metric_version="non-existent-version") + answer_relevance( + model="gateway:/gpt-3.5-turbo", + metric_version="non-existent-version", + examples=[mlflow_example], + ) + + +def test_make_genai_metric_metric_details(): + custom_metric = make_genai_metric( + name="correctness", + version="v1", + definition=example_definition, + grading_prompt=example_grading_prompt, + examples=[mlflow_example], + model="gateway:/gpt-3.5-turbo", + grading_context_columns=["targets"], + parameters={"temperature": 0.0}, + greater_is_better=True, + aggregations=["mean", "variance", "p90"], + ) + + # pylint: disable=line-too-long + expected_metric_details = "\nTask:\nYou are an impartial judge. You will be given an input that was sent to a machine\nlearning model, and you will be given an output that the model produced. You\nmay also be given additional information that was used by the model to generate the output.\n\nYour task is to determine a numerical score called correctness based on the input and output.\nA definition of correctness and a grading rubric are provided below.\nYou must use the grading rubric to determine your score. You must also justify your score.\n\nExamples could be included below for reference. Make sure to use them as references and to\nunderstand them before completing the task.\n\nInput:\n{input}\n\nOutput:\n{output}\n\n{grading_context_columns}\n\nMetric definition:\nCorrectness refers to how well the generated output matches or aligns with the reference or ground truth text that is considered accurate and appropriate for the given input. The ground truth serves as a benchmark against which the provided output is compared to determine the level of accuracy and fidelity.\n\nGrading rubric:\nCorrectness: If the answer correctly answer the question, below are the details for different scores: - Score 0: the answer is completely incorrect, doesn’t mention anything about the question or is completely contrary to the correct answer. - Score 1: the answer provides some relevance to the question and answer one aspect of the question correctly. - Score 2: the answer mostly answer the question but is missing or hallucinating on one critical aspect. 
- Score 4: the answer correctly answer the question and not missing any major aspect\n\nExamples:\n\nInput:\nWhat is MLflow?\n\nOutput:\nMLflow is an open-source platform for managing machine learning workflows, including experiment tracking, model packaging, versioning, and deployment, simplifying the ML lifecycle.\n\nAdditional information used by the model:\nkey: targets\nvalue:\nMLflow is an open-source platform for managing the end-to-end machine learning (ML) lifecycle. It was developed by Databricks, a company that specializes in big data and machine learning solutions. MLflow is designed to address the challenges that data scientists and machine learning engineers face when developing, training, and deploying machine learning models.\n\nscore: 4\njustification: The definition effectively explains what MLflow is its purpose, and its developer. It could be more concise for a 5-score.\n \n\nYou must return the following fields in your response one below the other:\nscore: Your numerical score for the model's correctness based on the rubric\njustification: Your step-by-step reasoning about the model's correctness score\n " + + assert custom_metric.metric_details == expected_metric_details + + assert ( + custom_metric.__str__() + == f"EvaluationMetric(name=correctness, greater_is_better=True, long_name=correctness, version=v1, metric_details={expected_metric_details})" + ) + # pylint: enable=line-too-long diff --git a/tests/metrics/genai/test_model_utils.py b/tests/metrics/genai/test_model_utils.py index 8611ada0310dd..049be652ac0e4 100644 --- a/tests/metrics/genai/test_model_utils.py +++ b/tests/metrics/genai/test_model_utils.py @@ -19,6 +19,19 @@ def set_envs(monkeypatch): ) +@pytest.fixture +def set_azure_envs(monkeypatch): + monkeypatch.setenvs( + { + "OPENAI_API_KEY": "test", + "OPENAI_API_TYPE": "azure", + "OPENAI_API_VERSION": "2023-05-15", + "OPENAI_API_BASE": "https://openai-for.openai.azure.com/", + "OPENAI_DEPLOYMENT_NAME": "test-openai", + } + ) + + def test_parse_model_uri(): prefix, suffix = _parse_model_uri("openai:/gpt-3.5-turbo") @@ -43,12 +56,12 @@ def test_parse_model_uri_throws_for_malformed(): def test_score_model_on_payload_throws_for_invalid(): with pytest.raises(MlflowException, match="Unknown model uri prefix"): - score_model_on_payload("myprovider:/gpt-3.5-turbo", {}) + score_model_on_payload("myprovider:/gpt-3.5-turbo", {}, 10) def test_score_model_openai_without_key(): with pytest.raises(MlflowException, match="OPENAI_API_KEY environment variable not set"): - score_model_on_payload("openai:/gpt-3.5-turbo", {}) + score_model_on_payload("openai:/gpt-3.5-turbo", {}, 10) def test_score_model_openai(set_envs): @@ -86,7 +99,9 @@ def json(self): } with mock.patch("requests.post", return_value=MockResponse(resp, 200)) as mock_post: - score_model_on_payload("openai:/gpt-3.5-turbo", {"prompt": "my prompt", "temperature": 0.1}) + score_model_on_payload( + "openai:/gpt-3.5-turbo", {"prompt": "my prompt", "temperature": 0.1}, 10 + ) mock_post.assert_called_once_with( url="https://api.openai.com/v1/chat/completions", headers={"Authorization": "Bearer test"}, @@ -95,6 +110,57 @@ def json(self): "temperature": 0.2, "messages": [{"role": "user", "content": "my prompt"}], }, + timeout=10, + ) + + +def test_score_model_azure_openai(set_azure_envs): + class MockResponse(Response): + def __init__(self, json_data, status_code): + super().__init__() + self.json_data = json_data + self.status_code = status_code + self.headers = {"Content-Type": "application/json"} + + def 
json(self): + return self.json_data + + resp = { + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": 1677858242, + "model": "gpt-3.5-turbo-0301", + "usage": { + "prompt_tokens": 13, + "completion_tokens": 7, + "total_tokens": 20, + }, + "choices": [ + { + "message": { + "role": "assistant", + "content": "\n\nThis is a test!", + }, + "finish_reason": "stop", + "index": 0, + } + ], + "headers": {"Content-Type": "application/json"}, + } + + with mock.patch("requests.post", return_value=MockResponse(resp, 200)) as mock_post: + score_model_on_payload( + "openai:/gpt-3.5-turbo", {"prompt": "my prompt", "temperature": 0.1}, 10 + ) + mock_post.assert_called_once_with( + url="https://openai-for.openai.azure.com/openai/deployments/test-openai/chat/" + "completions?api-version=2023-05-15", + headers={"api-key": "test"}, + json={ + "temperature": 0.2, + "messages": [{"role": "user", "content": "my prompt"}], + }, + timeout=10, ) @@ -120,5 +186,5 @@ def test_score_model_gateway(): } with mock.patch("mlflow.gateway.query", return_value=expected_output): - response = score_model_on_payload("gateway:/my-route", {}) + response = score_model_on_payload("gateway:/my-route", {}, 10) assert response == expected_output diff --git a/tests/metrics/test_base.py b/tests/metrics/test_base.py index 19065fbf31a6d..f0aa54bd6243e 100644 --- a/tests/metrics/test_base.py +++ b/tests/metrics/test_base.py @@ -14,18 +14,35 @@ def test_evaluation_example_str(): ) ) example1_expected = """ - Input: This is an input - Provided output: This is an output - Provided foo: bar - Score: 5 - Justification: This is a justification + Input: + This is an input + + Output: + This is an output + + Additional information used by the model: + key: foo + value: + bar + + score: 5 + justification: This is a justification """ assert re.sub(r"\s+", "", example1_expected) == re.sub(r"\s+", "", example1) - example2 = str(EvaluationExample(input="This is an input", output="This is an output", score=5)) + example2 = str( + EvaluationExample( + input="This is an input", output="This is an output", score=5, justification="It works" + ) + ) example2_expected = """ - Input: This is an input - Provided output: This is an output - Score: 5 + Input: + This is an input + + Output: + This is an output + + score: 5 + justification: It works """ assert re.sub(r"\s+", "", example2_expected) == re.sub(r"\s+", "", example2) diff --git a/tests/metrics/test_metric_definitions.py b/tests/metrics/test_metric_definitions.py index 1cb0768f571e6..31bfe8232d789 100644 --- a/tests/metrics/test_metric_definitions.py +++ b/tests/metrics/test_metric_definitions.py @@ -13,7 +13,6 @@ mape, max_error, mse, - perplexity, precision_score, r2_score, recall_score, @@ -29,15 +28,14 @@ @pytest.mark.parametrize( "metric", [ - ari_grade_level, - exact_match, - flesch_kincaid_grade_level, - perplexity, - rouge1, - rouge2, - rougeL, - rougeLsum, - toxicity, + ari_grade_level(), + exact_match(), + flesch_kincaid_grade_level(), + rouge1(), + rouge2(), + rougeL(), + rougeLsum(), + toxicity(), ], ) def test_return_type_and_len_with_target(metric): @@ -61,7 +59,7 @@ def _is_toxic(score): def test_toxicity(): predictions = pd.Series(["A normal sentence", "All women are bad"]) - result = toxicity.eval_fn(predictions, None, {}) + result = toxicity().eval_fn(predictions, None, {}) assert not _is_toxic(result.scores[0]) assert _is_toxic(result.scores[1]) assert result.aggregate_results["ratio"] == 0.5 @@ -70,16 +68,6 @@ def test_toxicity(): assert "variance" in 
result.aggregate_results -def test_perplexity(): - predictions = pd.Series(["sentence not", "This is a sentence"]) - result = perplexity.eval_fn(predictions, None, {}) - # A properly structured sentence should have lower perplexity - assert result.scores[0] > result.scores[1] - assert result.aggregate_results["mean"] == (result.scores[0] + result.scores[1]) / 2 - assert result.scores[0] > result.aggregate_results["p90"] > result.scores[1] - assert "variance" in result.aggregate_results - - def test_flesch_kincaid_grade_level(): predictions = pd.Series( [ @@ -90,7 +78,7 @@ def test_flesch_kincaid_grade_level(): ), ] ) - result = flesch_kincaid_grade_level.eval_fn(predictions, None, {}) + result = flesch_kincaid_grade_level().eval_fn(predictions, None, {}) assert result.scores[0] < result.scores[1] assert result.aggregate_results["mean"] == (result.scores[0] + result.scores[1]) / 2 assert result.scores[0] < result.aggregate_results["p90"] < result.scores[1] @@ -107,7 +95,7 @@ def test_ari_grade_level(): ), ] ) - result = ari_grade_level.eval_fn(predictions, None, {}) + result = ari_grade_level().eval_fn(predictions, None, {}) assert result.scores[0] < result.scores[1] assert result.aggregate_results["mean"] == (result.scores[0] + result.scores[1]) / 2 assert result.scores[0] < result.aggregate_results["p90"] < result.scores[1] @@ -118,19 +106,19 @@ def test_exact_match(): predictions = pd.Series(["sentence not", "random text", "a", "c"]) targets = pd.Series(["sentence not", "random text", "a", "c"]) - result = exact_match.eval_fn(predictions, targets, {}) + result = exact_match().eval_fn(predictions, targets, {}) assert result.aggregate_results["exact_match"] == 1.0 predictions = pd.Series(["not sentence", "random text", "b", "c"]) targets = pd.Series(["sentence not", "random text", "a", "c"]) - result = exact_match.eval_fn(predictions, targets, {}) + result = exact_match().eval_fn(predictions, targets, {}) assert result.aggregate_results["exact_match"] == 0.5 def test_rouge1(): predictions = pd.Series(["a", "d c"]) targets = pd.Series(["d", "b c"]) - result = rouge1.eval_fn(predictions, targets, {}) + result = rouge1().eval_fn(predictions, targets, {}) assert result.scores[0] == 0.0 assert result.scores[1] == 0.5 assert result.aggregate_results["mean"] == 0.25 @@ -141,7 +129,7 @@ def test_rouge1(): def test_rouge2(): predictions = pd.Series(["a e", "b c e"]) targets = pd.Series(["a e", "b c d"]) - result = rouge2.eval_fn(predictions, targets, {}) + result = rouge2().eval_fn(predictions, targets, {}) assert result.scores[0] == 1.0 assert result.scores[1] == 0.5 assert result.aggregate_results["mean"] == 0.75 @@ -152,7 +140,7 @@ def test_rouge2(): def test_rougeL(): predictions = pd.Series(["a", "b c"]) targets = pd.Series(["d", "b c"]) - result = rougeL.eval_fn(predictions, targets, {}) + result = rougeL().eval_fn(predictions, targets, {}) assert result.scores[0] == 0.0 assert result.scores[1] == 1.0 assert result.aggregate_results["mean"] == 0.5 @@ -163,7 +151,7 @@ def test_rougeL(): def test_rougeLsum(): predictions = pd.Series(["a", "b c"]) targets = pd.Series(["d", "b c"]) - result = rougeLsum.eval_fn(predictions, targets, {}) + result = rougeLsum().eval_fn(predictions, targets, {}) assert result.scores[0] == 0.0 assert result.scores[1] == 1.0 assert result.aggregate_results["mean"] == 0.5 @@ -172,11 +160,17 @@ def test_rougeLsum(): def test_fails_to_load_metric(): + from mlflow.metrics.metric_definitions import _cached_evaluate_load + + _cached_evaluate_load.cache_clear() + predictions 
= pd.Series(["random text", "This is a sentence"]) e = ImportError("mocked error") - with mock.patch("evaluate.load", side_effect=e) as mock_load: + with mock.patch( + "mlflow.metrics.metric_definitions._cached_evaluate_load", side_effect=e + ) as mock_load: with mock.patch("mlflow.metrics.metric_definitions._logger.warning") as mock_warning: - toxicity.eval_fn(predictions, None, {}) + toxicity().eval_fn(predictions, None, {}) mock_load.assert_called_once_with("toxicity", module_type="measurement") mock_warning.assert_called_once_with( f"Failed to load 'toxicity' metric (error: {e!r}), skipping metric logging.", @@ -186,61 +180,61 @@ def test_fails_to_load_metric(): def test_mae(): predictions = pd.Series([1.0, 2.0, 0.0]) targets = pd.Series([1.0, 2.0, 3.0]) - result = mae.eval_fn(predictions, targets, {}) + result = mae().eval_fn(predictions, targets, {}) assert result.aggregate_results["mean_absolute_error"] == 1.0 def test_mse(): predictions = pd.Series([1.0, 2.0, 0.0]) targets = pd.Series([1.0, 2.0, 3.0]) - result = mse.eval_fn(predictions, targets, {}) + result = mse().eval_fn(predictions, targets, {}) assert result.aggregate_results["mean_squared_error"] == 3.0 def test_rmse(): predictions = pd.Series([4.0, 5.0, 0.0]) targets = pd.Series([1.0, 2.0, 3.0]) - result = rmse.eval_fn(predictions, targets, {}) + result = rmse().eval_fn(predictions, targets, {}) assert result.aggregate_results["root_mean_squared_error"] == 3.0 def test_r2_score(): predictions = pd.Series([1.0, 2.0, 3.0]) targets = pd.Series([3.0, 2.0, 1.0]) - result = r2_score.eval_fn(predictions, targets, {}) + result = r2_score().eval_fn(predictions, targets, {}) assert result.aggregate_results["r2_score"] == -3.0 def test_max_error(): predictions = pd.Series([1.0, 2.0, 3.0]) targets = pd.Series([3.0, 2.0, 1.0]) - result = max_error.eval_fn(predictions, targets, {}) + result = max_error().eval_fn(predictions, targets, {}) assert result.aggregate_results["max_error"] == 2.0 def test_mape_error(): predictions = pd.Series([1.0, 1.0, 1.0]) targets = pd.Series([2.0, 2.0, 2.0]) - result = mape.eval_fn(predictions, targets, {}) + result = mape().eval_fn(predictions, targets, {}) assert result.aggregate_results["mean_absolute_percentage_error"] == 0.5 def test_binary_recall_score(): predictions = pd.Series([0, 0, 1, 1, 0, 0, 0, 1]) targets = pd.Series([1, 1, 1, 1, 0, 0, 0, 0]) - result = recall_score.eval_fn(predictions, targets, {}) + result = recall_score().eval_fn(predictions, targets, {}) assert abs(result.aggregate_results["recall_score"] - 0.5) < 1e-3 def test_binary_precision(): predictions = pd.Series([0, 0, 1, 1, 0, 0, 0, 1]) targets = pd.Series([1, 1, 1, 1, 0, 0, 0, 0]) - result = precision_score.eval_fn(predictions, targets, {}) + result = precision_score().eval_fn(predictions, targets, {}) assert abs(result.aggregate_results["precision_score"] == 0.666) < 1e-3 def test_binary_f1_score(): predictions = pd.Series([0, 0, 1, 1, 0, 0, 0, 1]) targets = pd.Series([1, 1, 1, 1, 0, 0, 0, 0]) - result = f1_score.eval_fn(predictions, targets, {}) + result = f1_score().eval_fn(predictions, targets, {}) assert abs(result.aggregate_results["f1_score"] - 0.5713) < 1e-3 diff --git a/tests/mleap/test_mleap_model_export.py b/tests/mleap/test_mleap_model_export.py index 75934fc9457ea..a410bad38e898 100644 --- a/tests/mleap/test_mleap_model_export.py +++ b/tests/mleap/test_mleap_model_export.py @@ -44,13 +44,13 @@ def get_mleap_jars(): @pytest.fixture(scope="module") -def spark_context(): +def spark(): conf = pyspark.SparkConf() 
conf.set(key="spark.jars.packages", value=get_mleap_jars()) # Exclude `net.sourceforge.f2j` to avoid `java.io.FileNotFoundException` conf.set(key="spark.jars.excludes", value="net.sourceforge.f2j:arpack_combined_all") with get_spark_session(conf) as spark_session: - yield spark_session.sparkContext + yield spark_session @pytest.mark.skipif( diff --git a/tests/models/test_signature.py b/tests/models/test_signature.py index 78c1f38484abd..d8525d6bcced0 100644 --- a/tests/models/test_signature.py +++ b/tests/models/test_signature.py @@ -163,14 +163,14 @@ def test_signature_inference_infers_datime_types_as_expected(): signature = infer_signature(test_df) assert signature.inputs == Schema([ColSpec(DataType.datetime, name=col_name)]) - spark = pyspark.sql.SparkSession.builder.getOrCreate() - spark_df = spark.range(1).selectExpr( - "current_timestamp() as timestamp", "current_date() as date" - ) - signature = infer_signature(spark_df) - assert signature.inputs == Schema( - [ColSpec(DataType.datetime, name="timestamp"), ColSpec(DataType.datetime, name="date")] - ) + with pyspark.sql.SparkSession.builder.getOrCreate() as spark: + spark_df = spark.range(1).selectExpr( + "current_timestamp() as timestamp", "current_date() as date" + ) + signature = infer_signature(spark_df) + assert signature.inputs == Schema( + [ColSpec(DataType.datetime, name="timestamp"), ColSpec(DataType.datetime, name="date")] + ) def test_set_signature_to_logged_model(): diff --git a/tests/openai/test_openai_init.py b/tests/openai/test_openai_init.py new file mode 100644 index 0000000000000..5c268f11ae2e0 --- /dev/null +++ b/tests/openai/test_openai_init.py @@ -0,0 +1,26 @@ +import os +from importlib import reload + +from mlflow.openai import _OAITokenHolder + + +def test_set_api_key_on_tokenholder_init(monkeypatch): + # if the user sets the API key after the openai module, + # expect `openai.api_key` to not be set. + monkeypatch.delenv("OPENAI_API_KEY", False) + assert "OPENAI_API_KEY" not in os.environ + + import openai + + monkeypatch.setenv("OPENAI_API_KEY", "test-key") + assert openai.api_key is None + + # when OAITokenHolder is initialized, expect it to set `openai.api_key` + token_holder = _OAITokenHolder("open_ai") + assert openai.api_key == "test-key" + assert token_holder._key_configured + + # reload the module to simulate the env var being set before + # load. 
in this case we'd expect the API key to be present + reload(openai) + assert openai.api_key == "test-key" diff --git a/tests/openai/test_openai_model_export.py b/tests/openai/test_openai_model_export.py index 6e1a776097f44..ec70920b85264 100644 --- a/tests/openai/test_openai_model_export.py +++ b/tests/openai/test_openai_model_export.py @@ -2,6 +2,7 @@ import json from unittest import mock +import numpy as np import openai import openai.error import pandas as pd @@ -11,11 +12,13 @@ import mlflow import mlflow.pyfunc.scoring_server as pyfunc_scoring_server +from mlflow.models.signature import ModelSignature from mlflow.openai.utils import ( _mock_chat_completion_response, _mock_models_retrieve_response, _mock_request, ) +from mlflow.types.schema import ColSpec, ParamSchema, ParamSpec, Schema, TensorSpec from tests.helper_functions import pyfunc_serve_and_score_model @@ -560,6 +563,18 @@ def test_embeddings(tmp_path): assert preds == [[0.0]] * 100 +def test_embeddings_batch_size_azure(tmp_path, monkeypatch): + monkeypatch.setenv("OPENAI_API_TYPE", "azure") + mlflow.openai.save_model( + model="text-embedding-ada-002", + task=openai.Embedding, + path=tmp_path, + ) + model = mlflow.pyfunc.load_model(tmp_path) + + assert model._model_impl.api_config.batch_size == 16 + + def test_embeddings_pyfunc_server_and_score(tmp_path): mlflow.openai.save_model( model="text-embedding-ada-002", @@ -594,3 +609,42 @@ def test_spark_udf_embeddings(tmp_path, spark): ) df = df.withColumn("z", udf("x")).toPandas() assert df["z"].tolist() == [[0.0], [0.0]] + + +def test_inference_params(tmp_path): + mlflow.openai.save_model( + model="text-embedding-ada-002", + task=openai.Embedding, + path=tmp_path, + signature=ModelSignature( + inputs=Schema([ColSpec(type="string", name=None)]), + outputs=Schema([TensorSpec(type=np.dtype("float64"), shape=(-1,))]), + params=ParamSchema([ParamSpec(name="batch_size", dtype="long", default=16)]), + ), + ) + + model_info = mlflow.models.Model.load(tmp_path) + assert ( + len([p for p in model_info.signature.params if p.name == "batch_size" and p.default == 16]) + == 1 + ) + + model = mlflow.pyfunc.load_model(tmp_path) + data = pd.DataFrame({"text": ["a", "b"]}) + preds = model.predict(data, params={"batch_size": 5}) + assert preds == [[0.0], [0.0]] + + +def test_inference_params_overlap(tmp_path): + with pytest.raises(mlflow.MlflowException, match=r"any of \['prefix'\] as parameters"): + mlflow.openai.save_model( + model="text-davinci-003", + task=openai.Completion, + path=tmp_path, + prefix="Classify the following text's sentiment:", + signature=ModelSignature( + inputs=Schema([ColSpec(type="string", name=None)]), + outputs=Schema([ColSpec(type="string", name=None)]), + params=ParamSchema([ParamSpec(name="prefix", default=None, dtype="string")]), + ), + ) diff --git a/tests/projects/test_projects_cli.py b/tests/projects/test_projects_cli.py index dd7f99876304d..f123a7f3fd63f 100644 --- a/tests/projects/test_projects_cli.py +++ b/tests/projects/test_projects_cli.py @@ -1,4 +1,3 @@ -import hashlib import json import logging import os @@ -9,7 +8,7 @@ from click.testing import CliRunner from mlflow import MlflowClient, cli -from mlflow.utils import process +from mlflow.utils import insecure_hash, process from tests.integration.utils import invoke_cli_runner from tests.projects.utils import ( @@ -90,7 +89,9 @@ def clean_mlruns_dir(): def test_run_local_conda_env(): with open(os.path.join(TEST_PROJECT_DIR, "conda.yaml")) as handle: conda_env_contents = handle.read() - expected_env_name = 
"mlflow-%s" % hashlib.sha1(conda_env_contents.encode("utf-8")).hexdigest() + expected_env_name = ( + "mlflow-%s" % insecure_hash.sha1(conda_env_contents.encode("utf-8")).hexdigest() + ) try: process._exec_cmd(cmd=["conda", "env", "remove", "--name", expected_env_name]) except process.ShellCommandException: diff --git a/tests/pyfunc/test_scoring_server.py b/tests/pyfunc/test_scoring_server.py index 4aeb77a5f8574..0e2cd9025c780 100644 --- a/tests/pyfunc/test_scoring_server.py +++ b/tests/pyfunc/test_scoring_server.py @@ -418,7 +418,7 @@ def test_parse_json_input_split_oriented(): def test_records_oriented_json_to_df(): # test that datatype for "zip" column is not converted to "int64" jstr = """ - { + { "dataframe_records": [ {"zip":"95120","cost":10.45,"score":8}, {"zip":"95128","cost":23.0,"score":0}, @@ -444,9 +444,9 @@ def test_split_oriented_json_to_df(): { "dataframe_split": { "columns":["zip","cost","count"], - "index":[0,1,2], + "index":[0,1,2], "data":[["95120",10.45,-8],["95128",23.0,-1],["95128",12.1,1000]] - } + } } """ jstr, _ = pyfunc_scoring_server._split_data_and_params(jstr) diff --git a/tests/pyfunc/test_spark.py b/tests/pyfunc/test_spark.py index 6d7cdf1bfd20f..ea7ce0ef43140 100644 --- a/tests/pyfunc/test_spark.py +++ b/tests/pyfunc/test_spark.py @@ -57,8 +57,7 @@ types = [np.int32, int, str, np.float32, np.double, bool] -def score_model_as_udf(model_uri, pandas_df, result_type="double"): - spark = get_spark_session(pyspark.SparkConf()) +def score_spark(spark, model_uri, pandas_df, result_type="double"): spark_df = spark.createDataFrame(pandas_df).coalesce(1) pyfunc_udf = spark_udf( spark=spark, model_uri=model_uri, result_type=result_type, env_manager="local" @@ -67,6 +66,16 @@ def score_model_as_udf(model_uri, pandas_df, result_type="double"): return [x["prediction"] for x in new_df.collect()] +def score_model_as_udf(model_uri, pandas_df, result_type="double"): + if spark := pyspark.sql.SparkSession.getActiveSession(): + # Reuse the active SparkSession, don't kill it after use + return score_spark(spark, model_uri, pandas_df, result_type) + + # Create a new SparkSession, kill it after use + with get_spark_session(pyspark.SparkConf()) as spark: + return score_spark(spark, model_uri, pandas_df, result_type) + + class ConstantPyfuncWrapper: @staticmethod def predict(model_input): diff --git a/tests/pyfunc/test_spark_connect.py b/tests/pyfunc/test_spark_connect.py index a6defa70d12fd..feedfaddc24ba 100644 --- a/tests/pyfunc/test_spark_connect.py +++ b/tests/pyfunc/test_spark_connect.py @@ -41,3 +41,20 @@ def test_spark_udf_spark_connect_unsupported_env_manager(spark, tmp_path, env_ma match=f"Environment manager {env_manager!r} is not supported", ): mlflow.pyfunc.spark_udf(spark, str(tmp_path), env_manager=env_manager) + + +def test_spark_udf_spark_connect_with_model_logging(spark, tmp_path): + X, y = load_iris(return_X_y=True, as_frame=True) + model = LogisticRegression().fit(X, y) + + mlflow.set_tracking_uri(tmp_path.joinpath("mlruns").as_uri()) + mlflow.set_experiment("test") + with mlflow.start_run(): + signature = mlflow.models.infer_signature(X, y) + model_info = mlflow.sklearn.log_model(model, "model", signature=signature) + + udf = mlflow.pyfunc.spark_udf(spark, model_info.model_uri, env_manager="local") + X_test = X.head(5) + sdf = spark.createDataFrame(X_test) + preds = sdf.select(udf(*X_test.columns).alias("preds")).toPandas()["preds"] + np.testing.assert_array_almost_equal(preds, model.predict(X_test)) diff --git a/tests/recipes/test_ingest_step.py 
b/tests/recipes/test_ingest_step.py index a5799d8f74131..4da8f2f9f5f4d 100644 --- a/tests/recipes/test_ingest_step.py +++ b/tests/recipes/test_ingest_step.py @@ -35,7 +35,7 @@ def spark_session(): with tempfile.TemporaryDirectory() as tmpdir: with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/recipes/test_predict_step.py b/tests/recipes/test_predict_step.py index 60bb8d3846879..987bfabfc5298 100644 --- a/tests/recipes/test_predict_step.py +++ b/tests/recipes/test_predict_step.py @@ -28,7 +28,7 @@ def spark_session(): with tempfile.TemporaryDirectory() as tmpdir: with ( SparkSession.builder.master("local[*]") - .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0") .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") .config( "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog" diff --git a/tests/recipes/test_train_step.py b/tests/recipes/test_train_step.py index 149c15ae58f01..c83ed35644e25 100644 --- a/tests/recipes/test_train_step.py +++ b/tests/recipes/test_train_step.py @@ -170,8 +170,6 @@ def setup_train_step_with_tuning( def test_train_step(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" @@ -195,7 +193,7 @@ def test_train_step(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path): m_train.estimator_fn = estimator_fn recipe_config = read_yaml(tmp_recipe_root_path, _RECIPE_CONFIG_FILE_NAME) - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step = TrainStep.from_recipe_config(recipe_config, str(tmp_recipe_root_path)) train_step.run(str(train_step_output_dir)) @@ -205,13 +203,25 @@ def test_train_step(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path): assert "training_mean_squared_error" in metrics +@pytest.fixture(autouse=True) +def dummy_train_step(tmp_recipe_root_path, monkeypatch): + # `mock.patch("steps.train.estimator_fn", ...)` would fail without this fixture + steps = tmp_recipe_root_path / "steps" + steps.mkdir(exist_ok=True) + steps.joinpath("train.py").write_text( + """ +def estimator_fn(estimator_params=None): + return None +""" + ) + monkeypatch.syspath_prepend(str(tmp_recipe_root_path)) + + @mock.patch("mlflow.recipes.steps.train._REBALANCING_CUTOFF", 50) def test_train_step_imbalanced_data(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path): train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/multiclass" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" @@ -233,9 +243,7 @@ def test_train_step_imbalanced_data(tmp_recipe_root_path: Path, tmp_recipe_exec_ enabled: false """ ) - m_train = Mock() - m_train.estimator_fn = classifier_estimator_fn - with mock.patch.dict("sys.modules", 
{"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", classifier_estimator_fn): recipe_config = read_yaml(tmp_recipe_root_path, _RECIPE_CONFIG_FILE_NAME) train_step = TrainStep.from_recipe_config(recipe_config, str(tmp_recipe_root_path)) train_step.run(str(train_step_output_dir)) @@ -254,8 +262,6 @@ def test_train_step_classifier_automl( tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path, recipe ): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path, recipe=recipe) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( """ @@ -364,12 +370,8 @@ def test_train_steps_writes_model_pkl_and_card( tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path, use_tuning ): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) assert (train_step_output_dir / "model/python_model.pkl").exists() @@ -389,12 +391,8 @@ def test_train_steps_writes_card_with_model_and_run_links_on_databricks( monkeypatch.setenv("_DATABRICKS_WORKSPACE_ID", workspace_id) train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) with open(train_step_output_dir / "run_id") as f: @@ -414,12 +412,10 @@ def test_train_steps_writes_card_with_model_and_run_links_on_databricks( @pytest.mark.parametrize("use_tuning", [True, False]) def test_train_steps_autologs(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path, use_tuning): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning) m_train = Mock() m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) assert os.path.exists(train_step_output_dir / "run_id") @@ -440,8 +436,6 @@ def test_train_steps_with_correct_tags( ): monkeypatch.setenv(MLFLOW_RECIPES_EXECUTION_TARGET_STEP_NAME.name, "train") train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning) m_train = Mock() m_train.estimator_fn = estimator_fn @@ -465,12 +459,8 @@ def test_train_step_with_tuning_best_parameters( tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path ): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = 
setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning=True) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) assert (train_step_output_dir / "best_parameters.yaml").exists() @@ -500,14 +490,10 @@ def test_train_step_with_tuning_output_yaml_correct( num_sections, ): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning( tmp_recipe_root_path, use_tuning=True, with_hardcoded_params=with_hardcoded_params ) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) assert (train_step_output_dir / "best_parameters.yaml").exists() @@ -528,12 +514,8 @@ def test_train_step_with_tuning_child_runs_and_early_stop( tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path ): train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning(tmp_recipe_root_path, use_tuning=True) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step.run(str(train_step_output_dir)) with open(train_step_output_dir / "run_id") as f: @@ -591,7 +573,7 @@ def test_automl( monkeypatch.setenv(MLFLOW_RECIPES_EXECUTION_TARGET_STEP_NAME.name, "train") train_step_output_dir = setup_train_dataset(tmp_recipe_exec_path) recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) + recipe_steps_dir.mkdir(exist_ok=True) if generate_custom_metrics: recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") recipe_steps_dir.joinpath("custom_metrics.py").write_text( @@ -618,9 +600,8 @@ def weighted_mean_squared_error(eval_df, builtin_metrics): use_tuning=True, with_hardcoded_params=False, ) - m_train = Mock() - m_train.estimator_fn = estimator_fn - with mock.patch.dict("sys.modules", {"steps.train": m_train}): + + with mock.patch("steps.train.estimator_fn", estimator_fn): train_step._validate_and_apply_step_config() train_step._run(str(train_step_output_dir)) @@ -636,8 +617,6 @@ def test_tuning_multiclass(tmp_recipe_root_path: Path, tmp_recipe_exec_path: Pat train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/multiclass" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) train_step = setup_train_step_with_tuning( tmp_recipe_root_path, @@ -673,8 +652,6 @@ def test_train_step_with_predict_probability( train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/binary" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" @@ -742,8 +719,6 @@ def test_train_step_with_predict_probability_with_custom_prefix( train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/binary" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - 
recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" @@ -797,8 +772,6 @@ def test_train_step_with_label_encoding(tmp_recipe_root_path: Path, tmp_recipe_e train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/multiclass" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" @@ -843,14 +816,16 @@ def test_train_step_with_label_encoding(tmp_recipe_root_path: Path, tmp_recipe_e assert np.array_equal(np.unique(predicted_label), np.array(["a1", "a2", "a3", "b"])) +@pytest.mark.skipif( + os.name == "nt", + reason="Flaky on windows, sometimes fails with `(sqlite3.OperationalError) database is locked`", +) def test_train_step_with_probability_calibration( tmp_recipe_root_path: Path, tmp_recipe_exec_path: Path ): train_step_output_dir = setup_train_dataset( tmp_recipe_exec_path, recipe="classification/binary" ) - recipe_steps_dir = tmp_recipe_root_path.joinpath("steps") - recipe_steps_dir.mkdir(parents=True) recipe_yaml = tmp_recipe_root_path.joinpath(_RECIPE_CONFIG_FILE_NAME) recipe_yaml.write_text( f""" diff --git a/tests/recipes/test_transform_step.py b/tests/recipes/test_transform_step.py index 384842968f7a4..8a1516375be87 100644 --- a/tests/recipes/test_transform_step.py +++ b/tests/recipes/test_transform_step.py @@ -1,7 +1,6 @@ import os from pathlib import Path from unittest import mock -from unittest.mock import Mock import pandas as pd import pytest @@ -15,6 +14,20 @@ from mlflow.utils.file_utils import read_yaml +@pytest.fixture(autouse=True) +def dummy_transform_step(tmp_recipe_root_path, monkeypatch): + # `mock.patch("steps.transform.transformer_fn", ...)` would fail without this fixture + steps = tmp_recipe_root_path / "steps" + steps.mkdir(exist_ok=True) + steps.joinpath("transform.py").write_text( + """ +def transformer_fn(estimator_params=None): + return None +""" + ) + monkeypatch.syspath_prepend(str(tmp_recipe_root_path)) + + # Sets up the transform step and returns the constructed TransformStep instance and step output dir def set_up_transform_step(recipe_root: Path, transform_user_module): split_step_output_dir = recipe_root.joinpath("steps", "split", "outputs") @@ -61,10 +74,8 @@ def test_transform_step_writes_onehot_encoded_dataframe_and_transformer_pkl( ): from sklearn.preprocessing import StandardScaler - m = Mock() - m.transformer_fn = lambda: StandardScaler() # pylint: disable=unnecessary-lambda monkeypatch.setenv(MLFLOW_RECIPES_EXECUTION_DIRECTORY.name, str(tmp_recipe_root_path)) - with mock.patch.dict("sys.modules", {"steps.transform": m}): + with mock.patch("steps.transform.transformer_fn", lambda: StandardScaler()): transform_step, transform_step_output_dir, _ = set_up_transform_step( tmp_recipe_root_path, "transformer_fn" ) diff --git a/tests/resources/data/dataset.py b/tests/resources/data/dataset.py index 50b9b31d61c09..486e9809d5fdd 100644 --- a/tests/resources/data/dataset.py +++ b/tests/resources/data/dataset.py @@ -1,5 +1,4 @@ import base64 -import hashlib import json from typing import Any, Dict, List, Optional @@ -9,6 +8,7 @@ from mlflow.data.dataset import Dataset from mlflow.types import Schema from mlflow.types.utils import _infer_schema +from mlflow.utils import insecure_hash from tests.resources.data.dataset_source import TestDatasetSource @@ -29,7 +29,7 @@ def _compute_digest(self) -> str: 
Computes a digest for the dataset. Called if the user doesn't supply a digest when constructing the dataset. """ - hash_md5 = hashlib.md5() + hash_md5 = insecure_hash.md5() for hash_part in pd.util.hash_array(np.array(self._data_list)): hash_md5.update(hash_part) return base64.b64encode(hash_md5.digest()).decode("ascii") diff --git a/tests/resources/db/latest_schema.sql b/tests/resources/db/latest_schema.sql index 1c5f9d0715c35..2ec1661808255 100644 --- a/tests/resources/db/latest_schema.sql +++ b/tests/resources/db/latest_schema.sql @@ -82,15 +82,16 @@ CREATE TABLE model_versions ( status VARCHAR(20), status_message VARCHAR(500), run_link VARCHAR(500), + storage_location VARCHAR(500), CONSTRAINT model_version_pk PRIMARY KEY (name, version), FOREIGN KEY(name) REFERENCES registered_models (name) ON UPDATE CASCADE ) CREATE TABLE registered_model_aliases ( - name VARCHAR(256) NOT NULL, alias VARCHAR(256) NOT NULL, version INTEGER NOT NULL, + name VARCHAR(256) NOT NULL, CONSTRAINT registered_model_alias_pk PRIMARY KEY (name, alias), CONSTRAINT registered_model_alias_name_fkey FOREIGN KEY(name) REFERENCES registered_models (name) ON DELETE CASCADE ON UPDATE CASCADE ) @@ -122,8 +123,8 @@ CREATE TABLE runs ( deleted_time BIGINT, CONSTRAINT run_pk PRIMARY KEY (run_uuid), FOREIGN KEY(experiment_id) REFERENCES experiments (experiment_id), - CONSTRAINT source_type CHECK (source_type IN ('NOTEBOOK', 'JOB', 'LOCAL', 'UNKNOWN', 'PROJECT')), CONSTRAINT runs_lifecycle_stage CHECK (lifecycle_stage IN ('active', 'deleted')), + CONSTRAINT source_type CHECK (source_type IN ('NOTEBOOK', 'JOB', 'LOCAL', 'UNKNOWN', 'PROJECT')), CHECK (status IN ('SCHEDULED', 'FAILED', 'FINISHED', 'RUNNING', 'KILLED')) ) diff --git a/tests/sklearn/test_sklearn_model_export.py b/tests/sklearn/test_sklearn_model_export.py index 44b68be160b99..98e0c8cf8b6c0 100644 --- a/tests/sklearn/test_sklearn_model_export.py +++ b/tests/sklearn/test_sklearn_model_export.py @@ -709,7 +709,7 @@ def test_sklearn_compatible_with_mlflow_2_4_0(sklearn_knn_model, tmp_path): - setuptools==56.0.0 - wheel==0.40.0 dependencies: - - -r requirements.txt + - -r requirements.txt """ ) tmp_path.joinpath("requirements.txt").write_text( diff --git a/tests/spark/test_spark_model_export.py b/tests/spark/test_spark_model_export.py index ada83712ce04e..7793fbaf63542 100644 --- a/tests/spark/test_spark_model_export.py +++ b/tests/spark/test_spark_model_export.py @@ -72,21 +72,23 @@ def spark_custom_env(tmp_path): def _get_spark_session_with_retry(max_tries=3): conf = pyspark.SparkConf() - for num_tries in range(max_tries): + for attempt in range(max_tries): try: return get_spark_session(conf) - except Exception: - if num_tries >= max_tries - 1: + except Exception as e: + if attempt >= max_tries - 1: raise - _logger.exception(f"Attempt {num_tries} to create a SparkSession failed, retrying...") + _logger.exception( + f"Attempt {attempt} to create a SparkSession failed ({e!r}), retrying..." + ) # Specify `autouse=True` to ensure that a context is created # before any tests are executed. This ensures that the Hadoop filesystem # does not create its own SparkContext without the MLeap libraries required by # other tests. 
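# A minimal sketch of the module-scoped `spark` fixture pattern the surrounding
# hunks move to: yield the SparkSession itself (rather than its SparkContext)
# and rely on the session's context-manager protocol for cleanup. This is an
# illustrative, standalone sketch, assuming a pyspark version whose
# SparkSession supports `with` (as the patch relies on); the fixture name and
# master URL here are assumptions, not taken from the patch.
import pyspark
import pytest


@pytest.fixture(scope="module")
def spark():
    with pyspark.sql.SparkSession.builder.master("local[*]").getOrCreate() as session:
        # Tests receive the live session; it is stopped automatically on exit.
        yield session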
-@pytest.fixture(scope="module", autouse=True) -def spark_context(): +@pytest.fixture(scope="module") +def spark(): if Version(pyspark.__version__) < Version("3.1"): # A workaround for this issue: # https://stackoverflow.com/questions/62109276/errorjava-lang-unsupportedoperationexception-for-pyspark-pandas-udf-documenta @@ -103,9 +105,9 @@ def spark_context(): spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" """ f.write(conf) - spark = _get_spark_session_with_retry() - yield spark.sparkContext - spark.stop() + + with _get_spark_session_with_retry() as spark: + yield spark def iris_pandas_df(): @@ -119,11 +121,10 @@ def iris_pandas_df(): @pytest.fixture(scope="module") -def iris_df(spark_context): +def iris_df(spark): pdf = iris_pandas_df() feature_names = list(pdf.drop("label", axis=1).columns) - spark_session = pyspark.sql.SparkSession(spark_context) - iris_spark_df = spark_session.createDataFrame(pdf) + iris_spark_df = spark.createDataFrame(pdf) return feature_names, pdf, iris_spark_df @@ -170,8 +171,7 @@ def spark_model_transformer(iris_df): @pytest.fixture(scope="module") -def spark_model_estimator(iris_df, spark_context): - # pylint: disable=unused-argument +def spark_model_estimator(iris_df): feature_names, iris_pandas_df, iris_spark_df = iris_df assembler = VectorAssembler(inputCols=feature_names, outputCol="features") features_df = assembler.transform(iris_spark_df) @@ -190,6 +190,7 @@ def model_path(tmp_path): return os.path.join(tmp_path, "model") +@pytest.mark.usefixtures("spark") def test_hadoop_filesystem(tmp_path): # copy local dir to and back from HadoopFS and make sure the results match from mlflow.spark import _HadoopFileSystem as FS diff --git a/tests/store/artifact/test_databricks_models_artifact_repo.py b/tests/store/artifact/test_databricks_models_artifact_repo.py index 210090de2cd38..002feabcd69fd 100644 --- a/tests/store/artifact/test_databricks_models_artifact_repo.py +++ b/tests/store/artifact/test_databricks_models_artifact_repo.py @@ -1,5 +1,4 @@ import json -import re from unittest import mock from unittest.mock import ANY @@ -13,7 +12,7 @@ _DOWNLOAD_CHUNK_SIZE, DatabricksModelsArtifactRepository, ) -from mlflow.utils.file_utils import _ChunkDownloadError +from mlflow.utils.file_utils import _Chunk DATABRICKS_MODEL_ARTIFACT_REPOSITORY_PACKAGE = ( "mlflow.store.artifact.databricks_models_artifact_repo" @@ -300,7 +299,7 @@ def test_parallelized_download_file_using_http_uri_with_error_downloads( "signed_uri": "https://my-amazing-signed-uri-to-rule-them-all.com/1234-numbers-yay-567", "headers": [{"name": "header_name", "value": "header_value"}], } - error_downloads = {1: _ChunkDownloadError(False, "Internal Server Error", 500)} + error_downloads = {_Chunk(1, 2, 3): Exception("Internal Server Error")} with mock.patch( DATABRICKS_MODEL_ARTIFACT_REPOSITORY + ".list_artifacts", @@ -314,19 +313,21 @@ def test_parallelized_download_file_using_http_uri_with_error_downloads( ), mock.patch( DATABRICKS_MODEL_ARTIFACT_REPOSITORY_PACKAGE + ".parallelized_download_file_using_http_uri", return_value=error_downloads, - ): - with pytest.raises( - MlflowException, - match=re.compile( - ( - rf"Failed to download artifact {re.escape(remote_file_path)}:" - r".+Internal Server Error" - ), - re.DOTALL, - ), - ): + ), mock.patch( + DATABRICKS_MODEL_ARTIFACT_REPOSITORY_PACKAGE + ".download_chunk", + side_effect=Exception("Retry failed"), + ) as mock_download_chunk: + with pytest.raises(MlflowException, match="Retry failed"): 
databricks_model_artifact_repo._download_file(remote_file_path, "") + mock_download_chunk.assert_called_once_with( + range_start=2, + range_end=3, + headers={"header_name": "header_value"}, + download_path="", + http_uri="https://my-amazing-signed-uri-to-rule-them-all.com/1234-numbers-yay-567", + ) + @pytest.mark.parametrize( ("remote_file_path"), @@ -342,7 +343,7 @@ def test_parallelized_download_file_using_http_uri_with_failed_downloads( "signed_uri": "https://my-amazing-signed-uri-to-rule-them-all.com/1234-numbers-yay-567", "headers": [{"name": "header_name", "value": "header_value"}], } - failed_downloads = {1: _ChunkDownloadError(True, "Unauthorized", 401)} + failed_downloads = {_Chunk(1, 2, 3): Exception("Internal Server Error")} with mock.patch( DATABRICKS_MODEL_ARTIFACT_REPOSITORY + ".list_artifacts", diff --git a/tests/store/model_registry/test_file_store.py b/tests/store/model_registry/test_file_store.py index 99a8a4b8149ef..ef12611c2ba9e 100644 --- a/tests/store/model_registry/test_file_store.py +++ b/tests/store/model_registry/test_file_store.py @@ -1512,3 +1512,70 @@ def predict(self, context, model_input, params=None): mv2 = store.search_model_versions("name = 'model2'", max_results=10) assert len(mv2) == 1 assert mv2[0].name == "model2" + + +@pytest.mark.parametrize("copy_to_same_model", [False, True]) +def test_copy_model_version(store, copy_to_same_model): + name1 = "test_for_copy_MV1" + store.create_registered_model(name1) + src_tags = [ + ModelVersionTag("key", "value"), + ModelVersionTag("anotherKey", "some other value"), + ] + src_mv = _create_model_version( + store, name1, tags=src_tags, run_link="dummylink", description="test description" + ) + + # Make some changes to the src MV that won't be copied over + store.transition_model_version_stage( + name1, src_mv.version, "Production", archive_existing_versions=False + ) + + copy_rm_name = name1 if copy_to_same_model else "test_for_copy_MV2" + copy_mv_version = 2 if copy_to_same_model else 1 + timestamp = time.time() + dst_mv = store.copy_model_version(src_mv, copy_rm_name) + assert dst_mv.name == copy_rm_name + assert dst_mv.version == copy_mv_version + + copied_mv = store.get_model_version(dst_mv.name, dst_mv.version) + assert copied_mv.name == copy_rm_name + assert copied_mv.version == copy_mv_version + assert copied_mv.current_stage == "None" + assert copied_mv.creation_timestamp >= timestamp + assert copied_mv.last_updated_timestamp >= timestamp + assert copied_mv.description == "test description" + assert copied_mv.source == f"models:/{src_mv.name}/{src_mv.version}" + assert store.get_model_version_download_uri(dst_mv.name, dst_mv.version) == src_mv.source + assert copied_mv.run_link == "dummylink" + assert copied_mv.run_id == src_mv.run_id + assert copied_mv.status == "READY" + assert copied_mv.status_message is None + assert copied_mv.tags == {"key": "value", "anotherKey": "some other value"} + + # Copy a model version copy + double_copy_mv = store.copy_model_version(copied_mv, "test_for_copy_MV3") + assert double_copy_mv.source == f"models:/{copied_mv.name}/{copied_mv.version}" + assert store.get_model_version_download_uri(dst_mv.name, dst_mv.version) == src_mv.source + + +def test_writing_model_version_preserves_storage_location(store): + name = "test_storage_location_MV1" + source = "/special/source" + store.create_registered_model(name) + _create_model_version(store, name, source=source) + _create_model_version(store, name, source=source) + + # Run through all the operations that modify model versions and 
make sure that the + # `storage_location` property is not dropped. + store.transition_model_version_stage(name, 1, "Production", archive_existing_versions=False) + assert store._fetch_file_model_version_if_exists(name, 1).storage_location == source + store.update_model_version(name, 1, description="test description") + assert store._fetch_file_model_version_if_exists(name, 1).storage_location == source + store.transition_model_version_stage(name, 1, "Production", archive_existing_versions=True) + assert store._fetch_file_model_version_if_exists(name, 1).storage_location == source + store.rename_registered_model(name, "test_storage_location_new") + assert ( + store._fetch_file_model_version_if_exists("test_storage_location_new", 1).storage_location + == source + ) diff --git a/tests/store/model_registry/test_sqlalchemy_store.py b/tests/store/model_registry/test_sqlalchemy_store.py index 1e3c52da79bf4..40d8ea3a7be61 100644 --- a/tests/store/model_registry/test_sqlalchemy_store.py +++ b/tests/store/model_registry/test_sqlalchemy_store.py @@ -1639,3 +1639,48 @@ def test_delete_model_deletes_alias(store): match=r"Registered model alias test_alias not found.", ): store.get_model_version_by_alias(model_name, "test_alias") + + +@pytest.mark.parametrize("copy_to_same_model", [False, True]) +def test_copy_model_version(store, copy_to_same_model): + name1 = "test_for_copy_MV1" + store.create_registered_model(name1) + src_tags = [ + ModelVersionTag("key", "value"), + ModelVersionTag("anotherKey", "some other value"), + ] + src_mv = _mv_maker( + store, name1, tags=src_tags, run_link="dummylink", description="test description" + ) + + # Make some changes to the src MV that won't be copied over + store.transition_model_version_stage( + name1, src_mv.version, "Production", archive_existing_versions=False + ) + + copy_rm_name = name1 if copy_to_same_model else "test_for_copy_MV2" + copy_mv_version = 2 if copy_to_same_model else 1 + timestamp = time.time() + dst_mv = store.copy_model_version(src_mv, copy_rm_name) + assert dst_mv.name == copy_rm_name + assert dst_mv.version == copy_mv_version + + copied_mv = store.get_model_version(dst_mv.name, dst_mv.version) + assert copied_mv.name == copy_rm_name + assert copied_mv.version == copy_mv_version + assert copied_mv.current_stage == "None" + assert copied_mv.creation_timestamp >= timestamp + assert copied_mv.last_updated_timestamp >= timestamp + assert copied_mv.description == "test description" + assert copied_mv.source == f"models:/{src_mv.name}/{src_mv.version}" + assert store.get_model_version_download_uri(dst_mv.name, dst_mv.version) == src_mv.source + assert copied_mv.run_link == "dummylink" + assert copied_mv.run_id == src_mv.run_id + assert copied_mv.status == "READY" + assert copied_mv.status_message is None + assert copied_mv.tags == {"key": "value", "anotherKey": "some other value"} + + # Copy a model version copy + double_copy_mv = store.copy_model_version(copied_mv, "test_for_copy_MV3") + assert double_copy_mv.source == f"models:/{copied_mv.name}/{copied_mv.version}" + assert store.get_model_version_download_uri(dst_mv.name, dst_mv.version) == src_mv.source diff --git a/tests/store/tracking/test_file_store.py b/tests/store/tracking/test_file_store.py index 2f205dcf6ddd0..98407a580d20e 100644 --- a/tests/store/tracking/test_file_store.py +++ b/tests/store/tracking/test_file_store.py @@ -1,4 +1,3 @@ -import hashlib import json import os import posixpath @@ -38,6 +37,7 @@ from mlflow.store.entities.paged_list import PagedList from 
mlflow.store.tracking import SEARCH_MAX_RESULTS_DEFAULT from mlflow.store.tracking.file_store import FileStore +from mlflow.utils import insecure_hash from mlflow.utils.file_utils import TempDir, path_to_local_file_uri, read_yaml, write_yaml from mlflow.utils.mlflow_tags import MLFLOW_DATASET_CONTEXT, MLFLOW_LOGGED_MODELS, MLFLOW_RUN_NAME from mlflow.utils.name_utils import _EXPERIMENT_ID_FIXED_WIDTH, _GENERATOR_PREDICATES @@ -2493,7 +2493,7 @@ def assert_expected_input_storage_ids_present(run, dataset_storage_ids): inputs_dir = os.path.join(run_dir, FileStore.INPUTS_FOLDER_NAME) expected_input_storage_ids = [] for dataset_storage_id in dataset_storage_ids: - md5 = hashlib.md5(dataset_storage_id.encode("utf-8")) + md5 = insecure_hash.md5(dataset_storage_id.encode("utf-8")) md5.update(run.info.run_id.encode("utf-8")) expected_input_storage_ids.append(md5.hexdigest()) assert set(os.listdir(inputs_dir)) == set(expected_input_storage_ids) diff --git a/tests/store/tracking/test_sqlalchemy_store.py b/tests/store/tracking/test_sqlalchemy_store.py index c88cf6b444213..e4226c126dc3e 100644 --- a/tests/store/tracking/test_sqlalchemy_store.py +++ b/tests/store/tracking/test_sqlalchemy_store.py @@ -2969,23 +2969,23 @@ def test_insert_large_text_in_dataset_table(self): conn.execute( sqlalchemy.sql.text( f""" - INSERT INTO datasets - (dataset_uuid, - experiment_id, - name, - digest, - dataset_source_type, - dataset_source, - dataset_schema, + INSERT INTO datasets + (dataset_uuid, + experiment_id, + name, + digest, + dataset_source_type, + dataset_source, + dataset_schema, dataset_profile) - VALUES - ('test_uuid', - 0, - 'test_name', - 'test_digest', - 'test_source_type', + VALUES + ('test_uuid', + 0, + 'test_name', + 'test_digest', + 'test_source_type', '{dataset_source}', ' - test_schema', + test_schema', '{dataset_profile}') """ ) diff --git a/tests/tracking/context/test_git_context.py b/tests/tracking/context/test_git_context.py index 6097dc8cd8d06..d93c30437ba11 100644 --- a/tests/tracking/context/test_git_context.py +++ b/tests/tracking/context/test_git_context.py @@ -25,6 +25,7 @@ def patch_script_name(): def patch_git_repo(): mock_repo = mock.Mock() mock_repo.head.commit.hexsha = MOCK_COMMIT_HASH + mock_repo.ignored.return_value = [] with mock.patch("git.Repo", return_value=mock_repo): yield mock_repo @@ -45,13 +46,9 @@ def test_git_run_context_tags(patch_script_name, patch_git_repo): def test_git_run_context_caching(patch_script_name): """Check that the git commit hash is only looked up once.""" - mock_repo = mock.Mock() - mock_hexsha = mock.PropertyMock(return_value=MOCK_COMMIT_HASH) - type(mock_repo.head.commit).hexsha = mock_hexsha - - with mock.patch("git.Repo", return_value=mock_repo): + with mock.patch("git.Repo") as mock_repo: context = GitRunContext() context.in_context() context.tags() - assert mock_hexsha.call_count == 1 + mock_repo.assert_called_once() diff --git a/tests/tracking/fluent/test_fluent.py b/tests/tracking/fluent/test_fluent.py index f7dc5e9371917..f0716de5814b8 100644 --- a/tests/tracking/fluent/test_fluent.py +++ b/tests/tracking/fluent/test_fluent.py @@ -1289,3 +1289,83 @@ def test_get_parent_run(): assert parent_run.data.params == {"a": "1"} assert mlflow.get_parent_run(run_id) is None + + +def test_log_metric_async(): + run_operations = [] + + with mlflow.start_run() as parent: + for num in range(100): + run_operations.append( + mlflow.log_metric("async single metric", step=num, value=num, synchronous=False) + ) + metrics = {f"async batch metric {num}": num for num in 
range(100)} + run_operations.append(mlflow.log_metrics(metrics=metrics, step=1, synchronous=False)) + + for run_operation in run_operations: + run_operation.wait() + parent_run = mlflow.get_run(parent.info.run_id) + assert parent_run.info.run_id == parent.info.run_id + assert parent_run.data.metrics["async single metric"] == 99 + for num in range(100): + assert parent_run.data.metrics[f"async batch metric {num}"] == num + + +def test_log_metric_async_throws(): + with mlflow.start_run(): + with pytest.raises(MlflowException, match="Please specify value as a valid double"): + mlflow.log_metric( + "async single metric", step=1, value="single metric value", synchronous=False + ).wait() + + with pytest.raises(MlflowException, match="Please specify value as a valid double"): + mlflow.log_metrics( + metrics={f"async batch metric {num}": "batch metric value" for num in range(2)}, + step=1, + synchronous=False, + ).wait() + + +def test_log_param_async(): + run_operations = [] + + with mlflow.start_run() as parent: + run_operations.append(mlflow.log_param("async single param", value="1", synchronous=False)) + params = {f"async batch param {num}": num for num in range(100)} + run_operations.append(mlflow.log_params(params=params, synchronous=False)) + + for run_operation in run_operations: + run_operation.wait() + parent_run = mlflow.get_run(parent.info.run_id) + assert parent_run.info.run_id == parent.info.run_id + assert parent_run.data.params["async single param"] == "1" + for num in range(100): + assert parent_run.data.params[f"async batch param {num}"] == str(num) + + +def test_log_param_async_throws(): + with mlflow.start_run(): + mlflow.log_param("async single param", value="1", synchronous=False) + with pytest.raises(MlflowException, match="Changing param values is not allowed"): + mlflow.log_param("async single param", value="2", synchronous=False).wait() + + mlflow.log_params({"async batch param": "2"}, synchronous=False) + with pytest.raises(MlflowException, match="Changing param values is not allowed"): + mlflow.log_params({"async batch param": "3"}, synchronous=False).wait() + + +def test_set_tag_async(): + run_operations = [] + + with mlflow.start_run() as parent: + run_operations.append(mlflow.set_tag("async single tag", value="1", synchronous=False)) + tags = {f"async batch tag {num}": num for num in range(100)} + run_operations.append(mlflow.set_tags(tags=tags, synchronous=False)) + + for run_operation in run_operations: + run_operation.wait() + parent_run = mlflow.get_run(parent.info.run_id) + assert parent_run.info.run_id == parent.info.run_id + assert parent_run.data.tags["async single tag"] == "1" + for num in range(100): + assert parent_run.data.tags[f"async batch tag {num}"] == str(num) diff --git a/tests/tracking/request_header/test_registry.py b/tests/tracking/request_header/test_registry.py index 6eba4f701e207..f2295b5caf528 100644 --- a/tests/tracking/request_header/test_registry.py +++ b/tests/tracking/request_header/test_registry.py @@ -15,6 +15,12 @@ # pylint: disable=unused-argument +@pytest.fixture(autouse=True) +def reload_registry(): + yield + reload(mlflow.tracking.request_header.registry) + + def test_request_header_context_provider_registry_register(): provider_class = mock.Mock() diff --git a/tests/tracking/test_client.py b/tests/tracking/test_client.py index 571142b55d821..d1cf1499d78a8 100644 --- a/tests/tracking/test_client.py +++ b/tests/tracking/test_client.py @@ -5,8 +5,10 @@ from mlflow import MlflowClient from mlflow.entities import ExperimentTag, Run, 
RunInfo, RunStatus, RunTag, SourceType, ViewType +from mlflow.entities.metric import Metric from mlflow.entities.model_registry import ModelVersion, ModelVersionTag from mlflow.entities.model_registry.model_version_status import ModelVersionStatus +from mlflow.entities.param import Param from mlflow.exceptions import MlflowException from mlflow.store.model_registry.sqlalchemy_store import ( SqlAlchemyStore as SqlAlchemyModelRegistryStore, @@ -30,6 +32,12 @@ ) +@pytest.fixture(autouse=True) +def reset_registry_uri(): + yield + set_registry_uri(None) + + @pytest.fixture def mock_store(): with mock.patch("mlflow.tracking._tracking_service.utils._get_store") as mock_get_store: @@ -765,3 +773,38 @@ def test_update_run(mock_store): end_time=mock.ANY, run_name="my name", ) + + +def test_client_log_metric_params_tags_overrides(mock_store): + experiment_id = mock.Mock() + start_time = mock.Mock() + run_name = mock.Mock() + run = MlflowClient().create_run(experiment_id, start_time, tags={}, run_name=run_name) + run_id = run.info.run_id + + run_operation = MlflowClient().log_metric(run_id, "m1", 0.87, 123456789, 1, synchronous=False) + run_operation.wait() + + run_operation = MlflowClient().log_param(run_id, "p1", "pv1", synchronous=False) + run_operation.wait() + + run_operation = MlflowClient().set_tag(run_id, "t1", "tv1", synchronous=False) + run_operation.wait() + + mock_store.log_metric_async.assert_called_once_with(run_id, Metric("m1", 0.87, 123456789, 1)) + mock_store.log_param_async.assert_called_once_with(run_id, Param("p1", "pv1")) + mock_store.set_tag_async.assert_called_once_with(run_id, RunTag("t1", "tv1")) + + mock_store.reset_mock() + + # log_batch_async + MlflowClient().create_run(experiment_id, start_time, {}) + metrics = [Metric("m1", 0.87, 123456789, 1), Metric("m2", 0.87, 123456789, 1)] + tags = [RunTag("t1", "tv1"), RunTag("t2", "tv2")] + params = [Param("p1", "pv1"), Param("p2", "pv2")] + run_operation = MlflowClient().log_batch(run_id, metrics, params, tags, synchronous=False) + run_operation.wait() + + mock_store.log_batch_async.assert_called_once_with( + run_id=run_id, metrics=metrics, params=params, tags=tags + ) diff --git a/tests/transformers/test_transformers_model_export.py b/tests/transformers/test_transformers_model_export.py index cca0f080e9e09..e3284bede05cf 100644 --- a/tests/transformers/test_transformers_model_export.py +++ b/tests/transformers/test_transformers_model_export.py @@ -46,6 +46,7 @@ _get_instance_type, _get_or_infer_task_type, _infer_transformers_task_type, + _is_model_distributed_in_memory, _record_pipeline_components, _should_add_pyfunc_to_model, _TransformersModel, @@ -1644,6 +1645,17 @@ def test_fill_mask_pipeline(fill_mask_pipeline, model_path, inference_payload, r assert pd_inference == result +def test_fill_mask_pipeline_with_multiple_masks(fill_mask_pipeline, model_path): + data = ["I the whole of ", "I the whole of "] + + mlflow.transformers.save_model(fill_mask_pipeline, path=model_path) + pyfunc_loaded = mlflow.pyfunc.load_model(model_path) + + inference = pyfunc_loaded.predict(data) + assert len(inference) == 2 + assert all(len(value) == 3 for value in inference) + + @pytest.mark.parametrize( "invalid_data", [ @@ -3787,3 +3799,69 @@ def predict(self, context, model_input, params=None): python_model=TestModel(), artifacts={"some-model": "hf:/invalid-repo-id"}, ) + + +def test_model_distributed_across_devices(): + mock_model = mock.Mock() + mock_model.device.type = "meta" + mock_model.hf_device_map = { + "layer1": mock.Mock(type="cpu"), + 
"layer2": mock.Mock(type="cpu"), + "layer3": mock.Mock(type="gpu"), + "layer4": mock.Mock(type="disk"), + } + + assert _is_model_distributed_in_memory(mock_model) + + +def test_model_on_single_device(): + mock_model = mock.Mock() + mock_model.device.type = "cpu" + mock_model.hf_device_map = {} + + assert not _is_model_distributed_in_memory(mock_model) + + +def test_basic_model_with_accelerate_device_mapping_fails_save(tmp_path): + task = "translation_en_to_de" + architecture = "t5-small" + model = transformers.T5ForConditionalGeneration.from_pretrained( + pretrained_model_name_or_path=architecture, + device_map={"shared": "cpu", "encoder": "cpu", "decoder": "disk", "lm_head": "disk"}, + offload_folder=str(tmp_path / "weights"), + low_cpu_mem_usage=True, + ) + + tokenizer = transformers.T5TokenizerFast.from_pretrained( + pretrained_model_name_or_path=architecture, model_max_length=100 + ) + pipeline = transformers.pipeline(task=task, model=model, tokenizer=tokenizer) + + with pytest.raises( + MlflowException, + match="The model that is attempting to be saved has been loaded into memory", + ): + mlflow.transformers.save_model(transformers_model=pipeline, path=str(tmp_path / "model")) + + +def test_basic_model_with_accelerate_homogeneous_mapping_works(tmp_path): + task = "translation_en_to_de" + architecture = "t5-small" + model = transformers.T5ForConditionalGeneration.from_pretrained( + pretrained_model_name_or_path=architecture, + device_map={"shared": "cpu", "encoder": "cpu", "decoder": "cpu", "lm_head": "cpu"}, + low_cpu_mem_usage=True, + ) + + tokenizer = transformers.T5TokenizerFast.from_pretrained( + pretrained_model_name_or_path=architecture, model_max_length=100 + ) + pipeline = transformers.pipeline(task=task, model=model, tokenizer=tokenizer) + + mlflow.transformers.save_model(transformers_model=pipeline, path=str(tmp_path / "model")) + + loaded = mlflow.transformers.load_model(str(tmp_path / "model")) + + text = "Apples are delicious" + + assert loaded(text) == pipeline(text) diff --git a/tests/types/test_schema.py b/tests/types/test_schema.py index 79c7538ca403d..ecde1398aac85 100644 --- a/tests/types/test_schema.py +++ b/tests/types/test_schema.py @@ -632,10 +632,12 @@ def test_spark_type_mapping(pandas_df_with_all_types): ) actual_spark_schema = schema.as_spark_schema() assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue() - spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate()) - sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=actual_spark_schema) - schema2 = _infer_schema(sparkdf) - assert schema == schema2 + with pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate()) as spark_session: + sparkdf = spark_session.createDataFrame( + pandas_df_with_all_types, schema=actual_spark_schema + ) + schema2 = _infer_schema(sparkdf) + assert schema == schema2 # test unnamed columns schema = Schema([ColSpec(col.type) for col in schema.inputs]) diff --git a/tests/utils/test_async_logging_queue.py b/tests/utils/test_async_logging_queue.py new file mode 100644 index 0000000000000..553bb4b059a14 --- /dev/null +++ b/tests/utils/test_async_logging_queue.py @@ -0,0 +1,306 @@ +import io +import pickle +import random +import threading +import time +import uuid + +import pytest + +from mlflow import MlflowException +from mlflow.entities.metric import Metric +from mlflow.entities.param import Param +from mlflow.entities.run_tag import RunTag +from mlflow.utils.async_logging.async_logging_queue import AsyncLoggingQueue + 
+METRIC_PER_BATCH = 250 +TAGS_PER_BATCH = 1 +PARAMS_PER_BATCH = 1 +TOTAL_BATCHES = 5 + + +class RunData: + def __init__(self, throw_exception_on_batch_number=None) -> None: + if throw_exception_on_batch_number is None: + throw_exception_on_batch_number = [] + self.received_run_id = "" + self.received_metrics = [] + self.received_tags = [] + self.received_params = [] + self.batch_count = 0 + self.throw_exception_on_batch_number = ( + throw_exception_on_batch_number if throw_exception_on_batch_number else [] + ) + + def consume_queue_data(self, run_id, metrics, tags, params): + self.batch_count += 1 + if self.batch_count in self.throw_exception_on_batch_number: + raise MlflowException("Failed to log run data") + self.received_run_id = run_id + self.received_metrics.extend(metrics or []) + self.received_params.extend(params or []) + self.received_tags.extend(tags or []) + + +def test_single_thread_publish_consume_queue(): + run_id = "test_run_id" + run_data = RunData() + async_logging_queue = AsyncLoggingQueue(run_data.consume_queue_data) + async_logging_queue.activate() + metrics_sent = [] + tags_sent = [] + params_sent = [] + + run_operations = [] + for params, tags, metrics in _get_run_data(): + run_operations.append( + async_logging_queue.log_batch_async( + run_id=run_id, metrics=metrics, tags=tags, params=params + ) + ) + metrics_sent += metrics + tags_sent += tags + params_sent += params + + for run_operation in run_operations: + run_operation.wait() + + _assert_sent_received_data( + metrics_sent, + params_sent, + tags_sent, + run_data.received_metrics, + run_data.received_params, + run_data.received_tags, + ) + + +def test_queue_activation(): + run_id = "test_run_id" + run_data = RunData() + async_logging_queue = AsyncLoggingQueue(run_data.consume_queue_data) + + assert not async_logging_queue._is_activated + + metrics = [ + Metric( + key=f"batch metrics async-{val}", + value=val, + timestamp=val, + step=0, + ) + for val in range(METRIC_PER_BATCH) + ] + with pytest.raises(MlflowException, match="AsyncLoggingQueue is not activated."): + async_logging_queue.log_batch_async(run_id=run_id, metrics=metrics, tags=[], params=[]) + + async_logging_queue.activate() + assert async_logging_queue._is_activated + + +def test_partial_logging_failed(): + run_id = "test_run_id" + run_data = RunData(throw_exception_on_batch_number=[3, 4]) + + async_logging_queue = AsyncLoggingQueue(run_data.consume_queue_data) + async_logging_queue.activate() + + metrics_sent = [] + tags_sent = [] + params_sent = [] + + run_operations = [] + batch_id = 1 + for params, tags, metrics in _get_run_data(): + if batch_id in [3, 4]: + with pytest.raises(MlflowException, match="Failed to log run data"): + async_logging_queue.log_batch_async( + run_id=run_id, metrics=metrics, tags=tags, params=params + ).wait() + else: + run_operations.append( + async_logging_queue.log_batch_async( + run_id=run_id, metrics=metrics, tags=tags, params=params + ) + ) + metrics_sent += metrics + tags_sent += tags + params_sent += params + + batch_id += 1 + + for run_operation in run_operations: + run_operation.wait() + + _assert_sent_received_data( + metrics_sent, + params_sent, + tags_sent, + run_data.received_metrics, + run_data.received_params, + run_data.received_tags, + ) + + +def test_publish_multithread_consume_single_thread(): + run_id = "test_run_id" + run_data = RunData(throw_exception_on_batch_number=[]) + + async_logging_queue = AsyncLoggingQueue(run_data.consume_queue_data) + async_logging_queue.activate() + + run_operations = [] + t1 
= threading.Thread( + target=_send_metrics_tags_params, args=(async_logging_queue, run_id, run_operations) + ) + t2 = threading.Thread( + target=_send_metrics_tags_params, args=(async_logging_queue, run_id, run_operations) + ) + + t1.start() + t2.start() + t1.join() + t2.join() + + for run_operation in run_operations: + run_operation.wait() + + assert len(run_data.received_metrics) == 2 * METRIC_PER_BATCH * TOTAL_BATCHES + assert len(run_data.received_tags) == 2 * TAGS_PER_BATCH * TOTAL_BATCHES + assert len(run_data.received_params) == 2 * PARAMS_PER_BATCH * TOTAL_BATCHES + + +class Consumer: + def __init__(self) -> None: + self.metrics = [] + self.tags = [] + self.params = [] + + def consume_queue_data(self, run_id, metrics, tags, params): + time.sleep(0.5) + self.metrics.extend(metrics or []) + self.params.extend(params or []) + self.tags.extend(tags or []) + + +def test_async_logging_queue_pickle(): + run_id = "test_run_id" + consumer = Consumer() + async_logging_queue = AsyncLoggingQueue(consumer.consume_queue_data) + + # Pickle the queue without activating it. + buffer = io.BytesIO() + pickle.dump(async_logging_queue, buffer) + deserialized_queue = pickle.loads(buffer.getvalue()) # Type: AsyncLoggingQueue + + # activate the queue and then try to pickle it + async_logging_queue.activate() + + run_operations = [] + for val in range(0, 10): + run_operations.append( + async_logging_queue.log_batch_async( + run_id=run_id, + metrics=[Metric("metric", val, timestamp=time.time(), step=1)], + tags=[], + params=[], + ) + ) + + assert not async_logging_queue._queue.empty() + + # Pickle the queue + buffer = io.BytesIO() + pickle.dump(async_logging_queue, buffer) + + deserialized_queue = pickle.loads(buffer.getvalue()) # Type: AsyncLoggingQueue + assert deserialized_queue._queue.empty() + assert deserialized_queue._lock is not None + assert deserialized_queue._is_activated is False + + for run_operation in run_operations: + run_operation.wait() + + assert len(consumer.metrics) == 10 + + # try to log using deserialized queue after activating it. 
+ deserialized_queue.activate() + assert deserialized_queue._is_activated + + run_operations = [] + + for val in range(0, 10): + run_operations.append( + deserialized_queue.log_batch_async( + run_id=run_id, + metrics=[Metric("metric", val, timestamp=time.time(), step=1)], + tags=[], + params=[], + ) + ) + + for run_operation in run_operations: + run_operation.wait() + + assert len(deserialized_queue._logging_func.__self__.metrics) == 10 + + +def _send_metrics_tags_params(run_data_queueing_processor, run_id, run_operations=None): + if run_operations is None: + run_operations = [] + metrics_sent = [] + tags_sent = [] + params_sent = [] + + for params, tags, metrics in _get_run_data(): + run_operations.append( + run_data_queueing_processor.log_batch_async( + run_id=run_id, metrics=metrics, tags=tags, params=params + ) + ) + + time.sleep(random.randint(1, 3)) + metrics_sent += metrics + tags_sent += tags + params_sent += params + + +def _get_run_data(total_batches=TOTAL_BATCHES): + for num in range(0, total_batches): + guid8 = str(uuid.uuid4())[:8] + params = [ + Param(f"batch param-{guid8}-{val}", value=str(time.time())) + for val in range(PARAMS_PER_BATCH) + ] + tags = [ + RunTag(f"batch tag-{guid8}-{val}", value=str(time.time())) + for val in range(TAGS_PER_BATCH) + ] + metrics = [ + Metric( + key=f"batch metrics async-{num}", + value=val, + timestamp=int(time.time() * 1000), + step=0, + ) + for val in range(METRIC_PER_BATCH) + ] + yield params, tags, metrics + + +def _assert_sent_received_data( + metrics_sent, params_sent, tags_sent, received_metrics, received_params, received_tags +): + for num in range(1, len(metrics_sent)): + assert metrics_sent[num].key == received_metrics[num].key + assert metrics_sent[num].value == received_metrics[num].value + assert metrics_sent[num].timestamp == received_metrics[num].timestamp + assert metrics_sent[num].step == received_metrics[num].step + + for num in range(1, len(tags_sent)): + assert tags_sent[num].key == received_tags[num].key + assert tags_sent[num].value == received_tags[num].value + + for num in range(1, len(params_sent)): + assert params_sent[num].key == received_params[num].key + assert params_sent[num].value == received_params[num].value diff --git a/tests/utils/test_credentials.py b/tests/utils/test_credentials.py index f3701583cb1a6..2e05dd5fd8a65 100644 --- a/tests/utils/test_credentials.py +++ b/tests/utils/test_credentials.py @@ -104,6 +104,7 @@ def test_read_mlflow_creds_env_takes_precedence_over_file(tmp_path, monkeypatch) def test_mlflow_login(tmp_path, monkeypatch): + # Mock `input()` and `getpass()` to return host, username and password in order. 
with patch( "builtins.input", side_effect=["https://community.cloud.databricks.com/", "dummyusername"] ), patch("getpass.getpass", side_effect=["dummypassword"]): @@ -113,12 +114,12 @@ def test_mlflow_login(tmp_path, monkeypatch): monkeypatch.setenv("DATABRICKS_CONFIG_PROFILE", profile) class FakeWorkspaceClient: - class FakeUser: - def me(self): - return ["dummyusername"] + class FakeClusters: + def list(self): + return ["dummy_cluster"] def __init__(self): - self.current_user = FakeWorkspaceClient.FakeUser() + self.clusters = FakeWorkspaceClient.FakeClusters() with patch( "databricks.sdk.WorkspaceClient", diff --git a/tests/utils/test_requirements_utils.py b/tests/utils/test_requirements_utils.py index 1dc2b977a17ec..2098e22fea130 100644 --- a/tests/utils/test_requirements_utils.py +++ b/tests/utils/test_requirements_utils.py @@ -8,6 +8,7 @@ import mlflow import mlflow.utils.requirements_utils +from mlflow.utils.environment import infer_pip_requirements from mlflow.utils.requirements_utils import ( _capture_imported_modules, _get_installed_version, @@ -410,3 +411,24 @@ def predict(self, context, model_input, params=None): captured_modules = _capture_imported_modules(model_info.model_uri, "pyfunc") assert "pandas" in captured_modules assert "sklearn" in captured_modules + + +def test_capture_imported_modules_includes_gateway_extra(): + class MyModel(mlflow.pyfunc.PythonModel): + def predict(self, _, inputs, params=None): + import mlflow.gateway # noqa: F401 + + return inputs + + with mlflow.start_run(): + model_info = mlflow.pyfunc.log_model( + python_model=MyModel(), + artifact_path="test_model", + input_example=([1, 2, 3]), + ) + + captured_modules = _capture_imported_modules(model_info.model_uri, "pyfunc") + assert "mlflow.gateway" in captured_modules + + pip_requirements = infer_pip_requirements(model_info.model_uri, "pyfunc") + assert f"mlflow[gateway]=={mlflow.__version__}" in pip_requirements
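Note: the client and queue tests in this patch exercise MLflow's asynchronous logging path end to end. For reference, a minimal usage sketch of the `synchronous=False` pattern those tests rely on follows; it is illustrative only and not part of the diff, and the experiment ID, metric, param, and tag names are assumed values.

from mlflow import MlflowClient

# Illustrative sketch of the async logging pattern exercised by the tests above.
# Assumes a reachable tracking backend and an existing experiment with ID "0".
client = MlflowClient()
run = client.create_run(experiment_id="0")
run_id = run.info.run_id

# With synchronous=False, each call enqueues the write on the async logging queue
# and returns a RunOperations handle instead of blocking on the tracking store.
ops = [
    client.log_metric(run_id, "m1", 0.87, timestamp=123456789, step=1, synchronous=False),
    client.log_param(run_id, "p1", "pv1", synchronous=False),
    client.set_tag(run_id, "t1", "tv1", synchronous=False),
]

# Block until the queued writes have been flushed to the backend.
for op in ops:
    op.wait()

client.set_terminated(run_id)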