diff --git a/mmdet/models/detectors/glip.py b/mmdet/models/detectors/glip.py
index 45cfe7d39fd..80c8a1fb872 100644
--- a/mmdet/models/detectors/glip.py
+++ b/mmdet/models/detectors/glip.py
@@ -2,6 +2,7 @@
 import copy
 import re
 import warnings
+from os.path import expanduser
 from typing import Optional, Tuple, Union
 
 import torch
@@ -13,7 +14,7 @@ from .single_stage import SingleStageDetector
 
 
-def find_noun_phrases(caption: str) -> list:
+def find_noun_phrases(caption: str, verbose: bool = True) -> list:
     """Find noun phrases in a caption using nltk.
 
     Args:
         caption (str): The caption to analyze.
@@ -27,8 +28,12 @@ def find_noun_phrases(caption: str) -> list:
     """
     try:
         import nltk
-        nltk.download('punkt', download_dir='~/nltk_data')
-        nltk.download('averaged_perceptron_tagger', download_dir='~/nltk_data')
+        nltk.download(
+            'punkt', download_dir=expanduser('~/nltk_data'), quiet=not verbose)
+        nltk.download(
+            'averaged_perceptron_tagger',
+            download_dir=expanduser('~/nltk_data'),
+            quiet=not verbose)
     except ImportError:
         raise RuntimeError('nltk is not installed, please install it by: '
                            'pip install nltk.')
@@ -66,7 +71,7 @@ def remove_punctuation(text: str) -> str:
     return text.strip()
 
 
-def run_ner(caption: str) -> Tuple[list, list]:
+def run_ner(caption: str, verbose: bool = False) -> Tuple[list, list]:
     """Run NER on a caption and return the tokens and noun phrases.
 
     Args:
         caption (str): The input caption.
@@ -76,10 +81,11 @@
             - tokens_positive (List): A list of token positions.
             - noun_phrases (List): A list of noun phrases.
     """
-    noun_phrases = find_noun_phrases(caption)
+    noun_phrases = find_noun_phrases(caption, verbose=verbose)
     noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases]
     noun_phrases = [phrase for phrase in noun_phrases if phrase != '']
-    print('noun_phrases:', noun_phrases)
+    if verbose:
+        print('noun_phrases:', noun_phrases)
     relevant_phrases = noun_phrases
     labels = noun_phrases
@@ -271,11 +277,11 @@ def to_plain_text_prompts(self, original_caption):
         return caption_string, tokens_positive
 
     def get_tokens_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False,
-            enhanced_text_prompts: Optional[ConfigType] = None
-    ) -> Tuple[dict, str, list, list]:
+            self,
+            original_caption: Union[str, list, tuple],
+            custom_entities: bool = False,
+            enhanced_text_prompts: Optional[ConfigType] = None,
+            verbose: bool = False) -> Tuple[dict, str, list, list]:
         """Get the tokens positive and prompts for the caption."""
         if isinstance(original_caption, (list, tuple)) or custom_entities:
             if custom_entities and isinstance(original_caption, str):
@@ -300,7 +306,8 @@ def get_tokens_and_prompts(
             original_caption = original_caption.strip(self._special_tokens)
             tokenized = self.language_model.tokenizer([original_caption],
                                                       return_tensors='pt')
-            tokens_positive, noun_phrases = run_ner(original_caption)
+            tokens_positive, noun_phrases = run_ner(
+                original_caption, verbose=verbose)
             entities = noun_phrases
             caption_string = original_caption
@@ -313,12 +320,12 @@ def get_positive_map(self, tokenized, tokens_positive):
         return positive_map_label_to_token, positive_map
 
     def get_tokens_positive_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False,
-            enhanced_text_prompt: Optional[ConfigType] = None,
-            tokens_positive: Optional[list] = None,
-    ) -> Tuple[dict, str, Tensor, list]:
+            self,
+            original_caption: Union[str, list, tuple],
+            custom_entities: bool = False,
+            enhanced_text_prompt: Optional[ConfigType] = None,
+            tokens_positive: Optional[list] = None,
+            verbose: bool = False) -> Tuple[dict, str, Tensor, list]:
         if tokens_positive is not None:
             if tokens_positive == -1:
                 if not original_caption.endswith('.'):
@@ -354,7 +361,8 @@ def get_tokens_positive_and_prompts(
         else:
             tokenized, caption_string, tokens_positive, entities = \
                 self.get_tokens_and_prompts(
-                    original_caption, custom_entities, enhanced_text_prompt)
+                    original_caption, custom_entities, enhanced_text_prompt,
+                    verbose=verbose)
             positive_map_label_to_token, positive_map = self.get_positive_map(
                 tokenized, tokens_positive)
         if tokenized.input_ids.shape[1] > self.language_model.max_tokens:
@@ -367,7 +375,8 @@ def get_tokens_positive_and_prompts(
     def get_tokens_positive_and_prompts_chunked(
             self,
             original_caption: Union[list, tuple],
-            enhanced_text_prompts: Optional[ConfigType] = None):
+            enhanced_text_prompts: Optional[ConfigType] = None,
+            verbose: bool = False):
         chunked_size = self.test_cfg.get('chunked_size', -1)
         original_caption = [clean_label_name(i) for i in original_caption]
@@ -408,8 +417,10 @@ def get_tokens_positive_and_prompts_chunked(
             positive_map_chunked, \
             entities_chunked
 
-    def loss(self, batch_inputs: Tensor,
-             batch_data_samples: SampleList) -> Union[dict, list]:
+    def loss(self,
+             batch_inputs: Tensor,
+             batch_data_samples: SampleList,
+             verbose: bool = False) -> Union[dict, list]:
         # TODO: Only open vocabulary tasks are supported for training now.
         text_prompts = [
             data_samples.text for data_samples in batch_data_samples
@@ -427,7 +438,7 @@ def loss(self, batch_inputs: Tensor,
             # so there is no need to calculate them multiple times.
             tokenized, caption_string, tokens_positive, _ = \
                 self.get_tokens_and_prompts(
-                    text_prompts[0], True)
+                    text_prompts[0], True, verbose=verbose)
             new_text_prompts = [caption_string] * len(batch_inputs)
             for gt_label in gt_labels:
                 new_tokens_positive = [
@@ -440,7 +451,7 @@ def loss(self, batch_inputs: Tensor,
             for text_prompt, gt_label in zip(text_prompts, gt_labels):
                 tokenized, caption_string, tokens_positive, _ = \
                     self.get_tokens_and_prompts(
-                        text_prompt, True)
+                        text_prompt, True, verbose=verbose)
                 new_tokens_positive = [
                     tokens_positive[label] for label in gt_label
                 ]
diff --git a/mmdet/models/detectors/grounding_dino.py b/mmdet/models/detectors/grounding_dino.py
index b1ab7c2da16..fc89b90f905 100644
--- a/mmdet/models/detectors/grounding_dino.py
+++ b/mmdet/models/detectors/grounding_dino.py
@@ -132,11 +132,11 @@ def to_plain_text_prompts(self, original_caption):
         return caption_string, tokens_positive
 
     def get_tokens_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False,
-            enhanced_text_prompts: Optional[ConfigType] = None
-    ) -> Tuple[dict, str, list]:
+            self,
+            original_caption: Union[str, list, tuple],
+            custom_entities: bool = False,
+            enhanced_text_prompts: Optional[ConfigType] = None,
+            verbose: bool = False) -> Tuple[dict, str, list]:
         """Get the tokens positive and prompts for the caption."""
         if isinstance(original_caption, (list, tuple)) or custom_entities:
             if custom_entities and isinstance(original_caption, str):
@@ -176,7 +176,8 @@ def get_tokens_and_prompts(
                 padding='max_length' if self.language_model.pad_to_max else
                 'longest',
                 return_tensors='pt')
-            tokens_positive, noun_phrases = run_ner(original_caption)
+            tokens_positive, noun_phrases = run_ner(
+                original_caption, verbose=verbose)
             entities = noun_phrases
             caption_string = original_caption
@@ -193,12 +194,12 @@ def get_positive_map(self, tokenized, tokens_positive):
         return positive_map_label_to_token, positive_map
 
     def get_tokens_positive_and_prompts(
-            self,
-            original_caption: Union[str, list, tuple],
-            custom_entities: bool = False,
-            enhanced_text_prompt: Optional[ConfigType] = None,
-            tokens_positive: Optional[list] = None,
-    ) -> Tuple[dict, str, Tensor, list]:
+            self,
+            original_caption: Union[str, list, tuple],
+            custom_entities: bool = False,
+            enhanced_text_prompt: Optional[ConfigType] = None,
+            tokens_positive: Optional[list] = None,
+            verbose: bool = False) -> Tuple[dict, str, Tensor, list]:
         """Get the tokens positive and prompts for the caption.
 
         Args:
@@ -250,7 +251,8 @@ def get_tokens_positive_and_prompts(
         else:
             tokenized, caption_string, tokens_positive, entities = \
                 self.get_tokens_and_prompts(
-                    original_caption, custom_entities, enhanced_text_prompt)
+                    original_caption, custom_entities, enhanced_text_prompt,
+                    verbose=verbose)
             positive_map_label_to_token, positive_map = self.get_positive_map(
                 tokenized, tokens_positive)
         return positive_map_label_to_token, caption_string, \
@@ -416,8 +418,10 @@ def pre_decoder(
             head_inputs_dict['text_token_mask'] = text_token_mask
         return decoder_inputs_dict, head_inputs_dict
 
-    def loss(self, batch_inputs: Tensor,
-             batch_data_samples: SampleList) -> Union[dict, list]:
+    def loss(self,
+             batch_inputs: Tensor,
+             batch_data_samples: SampleList,
+             verbose: bool = False) -> Union[dict, list]:
         text_prompts = [
             data_samples.text for data_samples in batch_data_samples
         ]
@@ -455,7 +459,7 @@ def loss(self, batch_inputs: Tensor,
             # so there is no need to calculate them multiple times.
             tokenized, caption_string, tokens_positive, _ = \
                 self.get_tokens_and_prompts(
-                    text_prompts[0], True)
+                    text_prompts[0], True, verbose=verbose)
             new_text_prompts = [caption_string] * len(batch_inputs)
             for gt_label in gt_labels:
                 new_tokens_positive = [
@@ -468,7 +472,7 @@ def loss(self, batch_inputs: Tensor,
             for text_prompt, gt_label in zip(text_prompts, gt_labels):
                 tokenized, caption_string, tokens_positive, _ = \
                     self.get_tokens_and_prompts(
-                        text_prompt, True)
+                        text_prompt, True, verbose=verbose)
                 new_tokens_positive = [
                     tokens_positive[label] for label in gt_label
                 ]
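
Reviewer note: the sketch below is a minimal, self-contained illustration of the download pattern this patch introduces. It assumes nltk is installed; `ensure_nltk_data` is a hypothetical helper name for the example, not part of the patch.

    from os.path import expanduser

    import nltk


    def ensure_nltk_data(verbose: bool = False) -> None:
        # Same pattern as the patched find_noun_phrases(): expanduser()
        # resolves '~' to the real home directory (nltk does not expand
        # '~' itself, so the old literal string could end up creating a
        # directory named '~' under the working directory), and
        # quiet=not verbose silences the per-package download output.
        for package in ('punkt', 'averaged_perceptron_tagger'):
            nltk.download(
                package,
                download_dir=expanduser('~/nltk_data'),
                quiet=not verbose)


    ensure_nltk_data()              # quiet, matching the new run_ner default
    ensure_nltk_data(verbose=True)  # prints progress, like the old behavior

Note the asymmetric defaults in the patch: `find_noun_phrases` keeps `verbose=True`, so standalone callers still see nltk's output, while `run_ner` and the detector-level entry points (`loss`, `get_tokens_and_prompts`, `get_tokens_positive_and_prompts`) default to `verbose=False`, so training and inference stay quiet unless the flag is passed explicitly.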