diff --git a/packages/tasks/src/tasks/depth-estimation/data.ts b/packages/tasks/src/tasks/depth-estimation/data.ts
index 379a4b321..1fd08f25c 100644
--- a/packages/tasks/src/tasks/depth-estimation/data.ts
+++ b/packages/tasks/src/tasks/depth-estimation/data.ts
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on 1.4 million images.",
 			id: "Intel/dpt-large",
 		},
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on the KITTI dataset.",
-			id: "vinvino02/glpn-kitti",
+			id: "facebook/dpt-dinov2-large-kitti",
+		},
+		{
+			description: "A strong monocular depth estimation model.",
+			id: "Bingxin/Marigold",
 		},
 	],
 	spaces: [
diff --git a/packages/tasks/src/tasks/document-question-answering/data.ts b/packages/tasks/src/tasks/document-question-answering/data.ts
index c80679216..8cc5b311d 100644
--- a/packages/tasks/src/tasks/document-question-answering/data.ts
+++ b/packages/tasks/src/tasks/document-question-answering/data.ts
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
 			description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 			id: "naver-clova-ix/donut-base-finetuned-docvqa",
 		},
+		{
+			description: "A powerful model for document question answering.",
+			id: "google/pix2struct-docvqa-large",
+		},
 	],
 	spaces: [
 		{
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that can answer questions from invoices.",
 			id: "impira/invoices",
 		},
+		{
+			description: "An application to compare different document question answering models.",
+			id: "merve/compare_docvqa_models",
+		},
 	],
 	summary:
 		"Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",
diff --git a/packages/tasks/src/tasks/image-to-text/data.ts b/packages/tasks/src/tasks/image-to-text/data.ts
index 1f9514675..46d19b6f0 100644
--- a/packages/tasks/src/tasks/image-to-text/data.ts
+++ b/packages/tasks/src/tasks/image-to-text/data.ts
@@ -32,30 +32,22 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A robust image captioning model.",
-			id: "Salesforce/blip-image-captioning-large",
+			id: "Salesforce/blip2-opt-2.7b",
 		},
 		{
-			description: "A strong image captioning model.",
-			id: "nlpconnect/vit-gpt2-image-captioning",
+			description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+			id: "microsoft/kosmos-2-patch14-224",
 		},
 		{
 			description: "A strong optical character recognition model.",
-			id: "microsoft/trocr-base-printed",
-		},
-		{
-			description: "A strong visual question answering model for scientific diagrams.",
-			id: "google/pix2struct-ai2d-base",
-		},
-		{
-			description: "A strong captioning model for UI components.",
-			id: "google/pix2struct-widget-captioning-base",
-		},
-		{
-			description: "A captioning model for images that contain text.",
-			id: "google/pix2struct-textcaps-base",
+			id: "facebook/nougat-base",
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares various image captioning models.",
+			id: "nielsr/comparing-captioning-models",
+		},
 		{
 			description: "A robust image captioning application.",
 			id: "flax-community/image-captioning",
diff --git a/packages/tasks/src/tasks/object-detection/data.ts b/packages/tasks/src/tasks/object-detection/data.ts
index dea166a9a..76c9c9986 100644
--- a/packages/tasks/src/tasks/object-detection/data.ts
+++ b/packages/tasks/src/tasks/object-detection/data.ts
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			// TO DO: write description
 			description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 			id: "facebook/detr-resnet-50",
 		},
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "Leaderboard to compare various object detection models across several metrics.",
+			id: "hf-vision/object_detection_leaderboard",
+		},
 		{
 			description: "An object detection application that can detect unseen objects out of the box.",
-			id: "adirik/OWL-ViT",
+			id: "merve/owlv2",
 		},
 		{
 			description: "An application that contains various object detection models to try from.",
diff --git a/packages/tasks/src/tasks/text-to-image/data.ts b/packages/tasks/src/tasks/text-to-image/data.ts
index 872467726..4958765fa 100644
--- a/packages/tasks/src/tasks/text-to-image/data.ts
+++ b/packages/tasks/src/tasks/text-to-image/data.ts
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description:
-				"A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-			id: "CompVis/stable-diffusion-v1-4",
+			description: "One of the most powerful image generation models that can generate realistic outputs.",
+			id: "stabilityai/stable-diffusion-xl-base-1.0",
 		},
 		{
-			description:
-				"A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-			id: "dalle-mini/dalle-mega",
+			description: "A powerful yet fast image generation model.",
+			id: "latent-consistency/lcm-lora-sdxl",
 		},
 		{
 			description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
 			id: "stabilityai/stable-diffusion",
 		},
 		{
-			description: "An text-to-image application that can generate coherent text inside the image.",
+			description: "A text-to-image application to generate comics.",
+			id: "jbilcke-hf/ai-comic-factory",
+		},
+		{
+			description: "A text-to-image application that can generate coherent text inside the image.",
 			id: "DeepFloyd/IF",
 		},
 		{
-			description: "An powerful text-to-image application that can generate images.",
-			id: "kakaobrain/karlo",
+			description: "A powerful yet very fast image generation application.",
+			id: "latent-consistency/lcm-lora-for-sdxl",
 		},
 		{
-			description: "An powerful text-to-image application that can generates 3D representations.",
+			description: "A powerful text-to-image application that can generate 3D representations.",
 			id: "hysts/Shap-E",
 		},
 		{
-			description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+			description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 			id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
 		},
 	],
diff --git a/packages/tasks/src/tasks/text-to-video/data.ts b/packages/tasks/src/tasks/text-to-video/data.ts
index b9639e1f5..072d1394f 100644
--- a/packages/tasks/src/tasks/text-to-video/data.ts
+++ b/packages/tasks/src/tasks/text-to-video/data.ts
@@ -68,7 +68,7 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A strong model for video generation.",
-			id: "PAIR/text2video-zero-controlnet-canny-arcane",
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "A robust model for text-to-video generation.",
@@ -76,7 +76,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "A text-to-video generation model with high quality and smooth outputs.",
-			id: "cerspense/zeroscope_v2_576w",
+			id: "hotshotco/Hotshot-XL",
 		},
 	],
 	spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "An application that generates video from image and text.",
-			id: "TempoFunk/makeavid-sd-jax",
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "An application that generates videos from text and provides multi-model support.",
diff --git a/packages/tasks/src/tasks/visual-question-answering/data.ts b/packages/tasks/src/tasks/visual-question-answering/data.ts
index be140665d..2d94edd42 100644
--- a/packages/tasks/src/tasks/visual-question-answering/data.ts
+++ b/packages/tasks/src/tasks/visual-question-answering/data.ts
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares visual question answering models across different tasks.",
+			id: "merve/pix2struct",
+		},
 		{
 			description: "An application that can answer questions based on images.",
 			id: "nielsr/vilt-vqa",