From 902f33e56bb94701378acc9efba0316590d4eee8 Mon Sep 17 00:00:00 2001 From: cwx-worst-one <1029713857@qq.com> Date: Fri, 18 Oct 2024 06:10:46 +0000 Subject: [PATCH 1/5] test --- examples/slam_aac/README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/slam_aac/README.md b/examples/slam_aac/README.md index eb39d15..7b8ddd3 100644 --- a/examples/slam_aac/README.md +++ b/examples/slam_aac/README.md @@ -1,7 +1,6 @@ # SLAM-AAC -SLAM-AAC is a LLM-based model for Automated Audio Captioning (AAC) task. Inspired by techniques in machine translation and ASR, the model enhances audio captioning by incorporating paraphrasing augmentation and a plug-and-play CLAP-Refine strategy. - +SLAM-AAC is an LLM-based model for the Automated Audio Captioning (AAC) task. Inspired by techniques in machine translation and ASR, the model enhances audio captioning by incorporating paraphrasing augmentation and a plug-and-play CLAP-Refine strategy. For more details, please refer to the [paper](https://arxiv.org/abs/2410.09503). ## Model Architecture SLAM-AAC uses EAT as the audio encoder and Vicuna-7B as the LLM decoder. During training, only the Linear Projector and LoRA modules are trainable. For inference, multiple candidates are generated using different beam sizes, which are then refined using the CLAP-Refine strategy. 
@@ -81,8 +80,13 @@ If you already have the generated candidates and want to directly refine them us bash scripts/clap_refine.sh ``` - +@article{chen2024slam, + title={SLAM-AAC: Enhancing Audio Captioning with Paraphrasing Augmentation and CLAP-Refine through LLMs}, + author={Chen, Wenxi and Ma, Ziyang and Li, Xiquan and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Yu, Kai and Chen, Xie}, + journal={arXiv preprint arXiv:2410.09503}, + year={2024} +} +``` From 1738c0103c099942baee78aef5e3235b5ce28c70 Mon Sep 17 00:00:00 2001 From: cwx-worst-one <1029713857@qq.com> Date: Fri, 18 Oct 2024 06:22:25 +0000 Subject: [PATCH 2/5] sloth --- README.md | 28 ++++++++++++++++++++-------- examples/slam_aac/aac_config.py | 6 +++++- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ad1fe82..ca970bd 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,15 @@ developers to train custom multimodal large language model (MLLM), focusing on < # Table of Contents -1. [News](#news) -2. [Installation](#installation) -3. [Uasge](#uasge) - - [List of Recipes](#list-of-recipes) - - [Configuration Priority](#configuration-priority) -4. [Features](#features) -5. [Acknowledge](#acknowledge) -6. [Citation](#citation) +- [Table of Contents](#table-of-contents) +- [News](#news) +- [Installation](#installation) +- [Usage](#usage) + - [List of Recipes](#list-of-recipes) + - [Configuration Priority](#configuration-priority) +- [Features](#features) +- [Acknowledge](#acknowledge) + - [Citation](#citation) # News - [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) have been supported. 
@@ -129,3 +130,14 @@ SLAM-ASR: } ``` +SLAM-AAC: +``` +@article{chen2024slam, + title={SLAM-AAC: Enhancing Audio Captioning with Paraphrasing Augmentation and CLAP-Refine through LLMs}, + author={Chen, Wenxi and Ma, Ziyang and Li, Xiquan and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Yu, Kai and Chen, Xie}, + journal={arXiv preprint arXiv:2410.09503}, + year={2024} +} +``` + + diff --git a/examples/slam_aac/aac_config.py b/examples/slam_aac/aac_config.py index 50fca27..9fb747b 100644 --- a/examples/slam_aac/aac_config.py +++ b/examples/slam_aac/aac_config.py @@ -1,5 +1,9 @@ from dataclasses import dataclass, field from typing import Optional, List + +from torch.distributed.fsdp import ShardingStrategy + + @dataclass class ModelConfig: file: str = "examples/slam_aac/model/slam_model_aac.py:model_factory" @@ -125,7 +129,7 @@ class FSDPConfig: mixed_precision: bool = True use_fp16: bool = False # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD - sharding_strategy: str = "NO_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP + sharding_strategy: ShardingStrategy = "NO_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size. 
fsdp_activation_checkpointing: bool = True fsdp_cpu_offload: bool = False From 9ea77163a8f8150f8054d702a641e7a06f32f391 Mon Sep 17 00:00:00 2001 From: cwx-worst-one <1029713857@qq.com> Date: Fri, 18 Oct 2024 06:29:20 +0000 Subject: [PATCH 3/5] sloth --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index ca970bd..2a1988f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ developers to train custom multimodal large language model (MLLM), focusing on < # Table of Contents -- [Table of Contents](#table-of-contents) - [News](#news) - [Installation](#installation) - [Usage](#usage) From f5505fd83b47ae352c89a1e8f2e2163395f3ccdc Mon Sep 17 00:00:00 2001 From: cwx-worst-one <1029713857@qq.com> Date: Fri, 18 Oct 2024 06:30:52 +0000 Subject: [PATCH 4/5] test --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 2a1988f..89f4195 100644 --- a/README.md +++ b/README.md @@ -18,14 +18,14 @@ developers to train custom multimodal large language model (MLLM), focusing on < # Table of Contents -- [News](#news) -- [Installation](#installation) -- [Usage](#usage) +1. [News](#news) +2. [Installation](#installation) +3. [Usage](#usage) - [List of Recipes](#list-of-recipes) - [Configuration Priority](#configuration-priority) -- [Features](#features) -- [Acknowledge](#acknowledge) - - [Citation](#citation) +4. [Features](#features) +5. [Acknowledge](#acknowledge) +6. [Citation](#citation) # News - [Update Oct. 12, 2024] Recipes for [SLAM-AAC](examples/slam_aac/README.md) have been supported. 
From 837350096e7d4c1c9b5c53ce15b034015f3048c1 Mon Sep 17 00:00:00 2001 From: cwx-worst-one <1029713857@qq.com> Date: Fri, 18 Oct 2024 06:32:40 +0000 Subject: [PATCH 5/5] test --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 89f4195..0fd38d0 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,8 @@ developers to train custom multimodal large language model (MLLM), focusing on < 1. [News](#news) 2. [Installation](#installation) 3. [Usage](#usage) - - [List of Recipes](#list-of-recipes) - - [Configuration Priority](#configuration-priority) + - [List of Recipes](#list-of-recipes) + - [Configuration Priority](#configuration-priority) 4. [Features](#features) 5. [Acknowledge](#acknowledge) 6. [Citation](#citation)