Update models and Spaces for computer vision applications (#412)

Co-authored-by: Pedro Cuenca <[email protected]>
huggingface · Dec 13, 2023 · 98c9768
1 parent 29bce71
commit 98c9768

Showing 7 changed files with 46 additions and 35 deletions.
diff --git a/packages/tasks/src/tasks/depth-estimation/data.ts b/packages/tasks/src/tasks/depth-estimation/data.ts
@@ -24,14 +24,16 @@ const taskData: TaskDataCustom = {
 	metrics: [],
 	models: [
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on 1.4 million images.",
 			id: "Intel/dpt-large",
 		},
 		{
-			// TO DO: write description
 			description: "Strong Depth Estimation model trained on the KITTI dataset.",
-			id: "vinvino02/glpn-kitti",
+			id: "facebook/dpt-dinov2-large-kitti",
+		},
+		{
+			description: "A strong monocular depth estimation model.",
+			id: "Bingxin/Marigold",
 		},
 	],
 	spaces: [

diff --git a/packages/tasks/src/tasks/document-question-answering/data.ts b/packages/tasks/src/tasks/document-question-answering/data.ts
@@ -50,6 +50,10 @@ const taskData: TaskDataCustom = {
 			description: "A special model for OCR-free Document QA task. Donut model fine-tuned on DocVQA.",
 			id: "naver-clova-ix/donut-base-finetuned-docvqa",
 		},
+		{
+			description: "A powerful model for document question answering.",
+			id: "google/pix2struct-docvqa-large",
+		},
 	],
 	spaces: [
 		{
@@ -60,6 +64,10 @@ const taskData: TaskDataCustom = {
 			description: "An application that can answer questions from invoices.",
 			id: "impira/invoices",
 		},
+		{
+			description: "An application to compare different document question answering models.",
+			id: "merve/compare_docvqa_models",
+		},
 	],
 	summary:
 		"Document Question Answering (also known as Document Visual Question Answering) is the task of answering questions on document images. Document question answering models take a (document, question) pair as input and return an answer in natural language. Models usually rely on multi-modal features, combining text, position of words (bounding-boxes) and image.",

diff --git a/packages/tasks/src/tasks/image-to-text/data.ts b/packages/tasks/src/tasks/image-to-text/data.ts
@@ -32,30 +32,22 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A robust image captioning model.",
-			id: "Salesforce/blip-image-captioning-large",
+			id: "Salesforce/blip2-opt-2.7b",
 		},
 		{
-			description: "A strong image captioning model.",
-			id: "nlpconnect/vit-gpt2-image-captioning",
+			description: "A powerful and accurate image-to-text model that can also localize concepts in images.",
+			id: "microsoft/kosmos-2-patch14-224",
 		},
 		{
 			description: "A strong optical character recognition model.",
-			id: "microsoft/trocr-base-printed",
-		},
-		{
-			description: "A strong visual question answering model for scientific diagrams.",
-			id: "google/pix2struct-ai2d-base",
-		},
-		{
-			description: "A strong captioning model for UI components.",
-			id: "google/pix2struct-widget-captioning-base",
-		},
-		{
-			description: "A captioning model for images that contain text.",
-			id: "google/pix2struct-textcaps-base",
+			id: "facebook/nougat-base",
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares various image captioning models.",
+			id: "nielsr/comparing-captioning-models",
+		},
 		{
 			description: "A robust image captioning application.",
 			id: "flax-community/image-captioning",

diff --git a/packages/tasks/src/tasks/object-detection/data.ts b/packages/tasks/src/tasks/object-detection/data.ts
@@ -40,7 +40,6 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			// TO DO: write description
 			description: "Solid object detection model trained on the benchmark dataset COCO 2017.",
 			id: "facebook/detr-resnet-50",
 		},
@@ -50,9 +49,13 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "Leaderboard to compare various object detection models across several metrics.",
+			id: "hf-vision/object_detection_leaderboard",
+		},
 		{
 			description: "An object detection application that can detect unseen objects out of the box.",
-			id: "adirik/OWL-ViT",
+			id: "merve/owlv2",
 		},
 		{
 			description: "An application that contains various object detection models to try from.",

diff --git a/packages/tasks/src/tasks/text-to-image/data.ts b/packages/tasks/src/tasks/text-to-image/data.ts
@@ -45,14 +45,12 @@ const taskData: TaskDataCustom = {
 	],
 	models: [
 		{
-			description:
-				"A latent text-to-image diffusion model capable of generating photo-realistic images given any text input.",
-			id: "CompVis/stable-diffusion-v1-4",
+			description: "One of the most powerful image generation models that can generate realistic outputs.",
+			id: "stabilityai/stable-diffusion-xl-base-1.0",
 		},
 		{
-			description:
-				"A model that can be used to generate images based on text prompts. The DALL·E Mega model is the largest version of DALLE Mini.",
-			id: "dalle-mini/dalle-mega",
+			description: "A powerful yet fast image generation model.",
+			id: "latent-consistency/lcm-lora-sdxl",
 		},
 		{
 			description: "A text-to-image model that can generate coherent text inside image.",
@@ -69,19 +67,23 @@ const taskData: TaskDataCustom = {
 			id: "stabilityai/stable-diffusion",
 		},
 		{
-			description: "An text-to-image application that can generate coherent text inside the image.",
+			description: "A text-to-image application to generate comics.",
+			id: "jbilcke-hf/ai-comic-factory",
+		},
+		{
+			description: "A text-to-image application that can generate coherent text inside the image.",
 			id: "DeepFloyd/IF",
 		},
 		{
-			description: "An powerful text-to-image application that can generate images.",
-			id: "kakaobrain/karlo",
+			description: "A powerful yet very fast image generation application.",
+			id: "latent-consistency/lcm-lora-for-sdxl",
 		},
 		{
-			description: "An powerful text-to-image application that can generates 3D representations.",
+			description: "A powerful text-to-image application that can generate 3D representations.",
 			id: "hysts/Shap-E",
 		},
 		{
-			description: "A strong application for `text-to-image`, `image-to-image` and image inpainting.",
+			description: "An application for `text-to-image`, `image-to-image` and image inpainting.",
 			id: "ArtGAN/Stable-Diffusion-ControlNet-WebUI",
 		},
 	],

diff --git a/packages/tasks/src/tasks/text-to-video/data.ts b/packages/tasks/src/tasks/text-to-video/data.ts
@@ -68,15 +68,15 @@ const taskData: TaskDataCustom = {
 	models: [
 		{
 			description: "A strong model for video generation.",
-			id: "PAIR/text2video-zero-controlnet-canny-arcane",
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "A robust model for text-to-video generation.",
 			id: "damo-vilab/text-to-video-ms-1.7b",
 		},
 		{
 			description: "A text-to-video generation model with high quality and smooth outputs.",
-			id: "cerspense/zeroscope_v2_576w",
+			id: "hotshotco/Hotshot-XL",
 		},
 	],
 	spaces: [
@@ -86,7 +86,7 @@ const taskData: TaskDataCustom = {
 		},
 		{
 			description: "An application that generates video from image and text.",
-			id: "TempoFunk/makeavid-sd-jax",
+			id: "Vchitect/LaVie",
 		},
 		{
 			description: "An application that generates videos from text and provides multi-model support.",

diff --git a/packages/tasks/src/tasks/visual-question-answering/data.ts b/packages/tasks/src/tasks/visual-question-answering/data.ts
@@ -71,6 +71,10 @@ const taskData: TaskDataCustom = {
 		},
 	],
 	spaces: [
+		{
+			description: "An application that compares visual question answering models across different tasks.",
+			id: "merve/pix2struct",
+		},
 		{
 			description: "An application that can answer questions based on images.",
 			id: "nielsr/vilt-vqa",