computer_vision.bib

% Lu, Xiong, Parikh and Socher (2016): arXiv preprint 1612.01887, DBLP CoRR record.
@article{DBLP:journals/corr/LuXPS16,
  author        = {Lu, Jiasen and Xiong, Caiming and Parikh, Devi and
                   Socher, Richard},
  title         = {Knowing When to Look: Adaptive Attention via {A} Visual
                   Sentinel for Image Captioning},
  journal       = {CoRR},
  volume        = {abs/1612.01887},
  year          = {2016},
  eprint        = {1612.01887},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1612.01887},
  timestamp     = {Mon, 13 Aug 2018 16:46:57 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/LuXPS16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Xu et al. (2015), "Show, Attend and Tell": arXiv preprint 1502.03044, DBLP CoRR record.
@article{DBLP:journals/corr/XuBKCCSZB15,
  author        = {Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and
                   Cho, Kyunghyun and Courville, Aaron C. and
                   Salakhutdinov, Ruslan and Zemel, Richard S. and
                   Bengio, Yoshua},
  title         = {Show, Attend and Tell: Neural Image Caption Generation
                   with Visual Attention},
  journal       = {CoRR},
  volume        = {abs/1502.03044},
  year          = {2015},
  eprint        = {1502.03044},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1502.03044},
  timestamp     = {Mon, 13 Aug 2018 16:47:52 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/XuBKCCSZB15},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Rennie et al. (2016), self-critical sequence training: arXiv preprint 1612.00563, DBLP CoRR record.
@article{DBLP:journals/corr/RennieMMRG16,
  author        = {Rennie, Steven J. and Marcheret, Etienne and
                   Mroueh, Youssef and Ross, Jarret and Goel, Vaibhava},
  title         = {Self-critical Sequence Training for Image Captioning},
  journal       = {CoRR},
  volume        = {abs/1612.00563},
  year          = {2016},
  eprint        = {1612.00563},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1612.00563},
  timestamp     = {Mon, 13 Aug 2018 16:47:39 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/RennieMMRG16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Anderson et al. (2017), bottom-up and top-down attention: arXiv preprint 1707.07998, DBLP CoRR record.
@article{DBLP:journals/corr/AndersonHBTJGZ17,
  author        = {Anderson, Peter and He, Xiaodong and Buehler, Chris and
                   Teney, Damien and Johnson, Mark and Gould, Stephen and
                   Zhang, Lei},
  title         = {Bottom-Up and Top-Down Attention for Image Captioning and
                   {VQA}},
  journal       = {CoRR},
  volume        = {abs/1707.07998},
  year          = {2017},
  eprint        = {1707.07998},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1707.07998},
  timestamp     = {Tue, 20 Nov 2018 12:24:39 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/AndersonHBTJGZ17},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Yin and Ordonez (2017), OBJ2TEXT: arXiv preprint 1707.07102, DBLP CoRR record.
@article{DBLP:journals/corr/YinO17,
  author        = {Yin, Xuwang and Ordonez, Vicente},
  title         = {{OBJ2TEXT:} Generating Visually Descriptive Language from
                   Object Layouts},
  journal       = {CoRR},
  volume        = {abs/1707.07102},
  year          = {2017},
  eprint        = {1707.07102},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1707.07102},
  timestamp     = {Mon, 13 Aug 2018 16:49:07 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/YinO17},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Tan and Le (2019), EfficientNet: arXiv preprint 1905.11946, DBLP CoRR record.
% {EfficientNet} is braced so that sentence-casing styles keep the inner capital N.
@article{DBLP:journals/corr/abs-1905-11946,
  author        = {Tan, Mingxing and Le, Quoc V.},
  title         = {{EfficientNet}: Rethinking Model Scaling for Convolutional
                   Neural Networks},
  journal       = {CoRR},
  volume        = {abs/1905.11946},
  year          = {2019},
  eprint        = {1905.11946},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1905.11946},
  timestamp     = {Mon, 03 Jun 2019 13:42:33 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1905-11946},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Liu et al. (2018), "Show, Tell and Discriminate": arXiv preprint 1803.08314, DBLP CoRR record.
@article{DBLP:journals/corr/abs-1803-08314,
  author        = {Liu, Xihui and Li, Hongsheng and Shao, Jing and
                   Chen, Dapeng and Wang, Xiaogang},
  title         = {Show, Tell and Discriminate: Image Captioning by
                   Self-retrieval with Partially Labeled Data},
  journal       = {CoRR},
  volume        = {abs/1803.08314},
  year          = {2018},
  eprint        = {1803.08314},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1803.08314},
  timestamp     = {Mon, 13 Aug 2018 16:47:36 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1803-08314},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Kilickaya et al. (2016), captioning metrics re-evaluation: arXiv preprint 1612.07600, DBLP CoRR record.
@article{DBLP:journals/corr/KilickayaEIE16,
  author        = {Kilickaya, Mert and Erdem, Aykut and
                   Ikizler{-}Cinbis, Nazli and Erdem, Erkut},
  title         = {Re-evaluating Automatic Metrics for Image Captioning},
  journal       = {CoRR},
  volume        = {abs/1612.07600},
  year          = {2016},
  eprint        = {1612.07600},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1612.07600},
  timestamp     = {Mon, 13 Aug 2018 16:48:05 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/KilickayaEIE16},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Bartz, Yang and Meinel (2017), STN-OCR: arXiv preprint 1707.08831, DBLP CoRR record.
@article{DBLP:journals/corr/BartzYM17,
  author        = {Bartz, Christian and Yang, Haojin and Meinel, Christoph},
  title         = {{STN-OCR:} {A} single Neural Network for Text Detection
                   and Text Recognition},
  journal       = {CoRR},
  volume        = {abs/1707.08831},
  year          = {2017},
  eprint        = {1707.08831},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1707.08831},
  timestamp     = {Mon, 13 Aug 2018 16:48:31 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/BartzYM17},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Wang and Hu (2017), gated recurrent convolution network for OCR; conference paper.
% {OCR} is braced so that sentence-casing styles do not lowercase the acronym.
@inproceedings{jianfeng2017deep,
  author    = {Wang, Jianfeng and Hu, Xiaolin},
  title     = {Gated Recurrent Convolution Neural Network for {OCR}},
  booktitle = {Advances in Neural Information Processing Systems},
  year      = {2017},
}

% Namysl and Konya (2019), lexicon-free OCR: arXiv preprint 1906.01969, DBLP CoRR record.
@article{DBLP:journals/corr/abs-1906-01969,
  author        = {Namysl, Marcin and Konya, Iuliu},
  title         = {Efficient, Lexicon-Free {OCR} using Deep Learning},
  journal       = {CoRR},
  volume        = {abs/1906.01969},
  year          = {2019},
  eprint        = {1906.01969},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1906.01969},
  timestamp     = {Thu, 13 Jun 2019 13:36:00 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1906-01969},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Razavi, van den Oord and Vinyals (2019), VQ-VAE-2: arXiv preprint 1906.00446, DBLP CoRR record.
% The comma form keeps "van den" as the von part of the second author's surname.
@article{DBLP:journals/corr/abs-1906-00446,
  author        = {Razavi, Ali and van den Oord, A{\"{a}}ron and
                   Vinyals, Oriol},
  title         = {Generating Diverse High-Fidelity Images with {VQ-VAE-2}},
  journal       = {CoRR},
  volume        = {abs/1906.00446},
  year          = {2019},
  eprint        = {1906.00446},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1906.00446},
  timestamp     = {Thu, 13 Jun 2019 13:36:00 +0200},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1906-00446},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Wang et al. (2019), detecting CNN-generated images: arXiv preprint 1912.11035 (cs.CV).
% {CNN} is braced so that sentence-casing styles do not turn the acronym into "Cnn".
@misc{wang2019cnngenerated,
  author        = {Wang, Sheng-Yu and Wang, Oliver and Zhang, Richard and
                   Owens, Andrew and Efros, Alexei A.},
  title         = {{CNN}-generated images are surprisingly easy to spot... for
                   now},
  year          = {2019},
  eprint        = {1912.11035},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Karras, Laine and Aila (2018), style-based GAN generator: arXiv preprint 1812.04948, DBLP CoRR record.
@article{DBLP:journals/corr/abs-1812-04948,
  author        = {Karras, Tero and Laine, Samuli and Aila, Timo},
  title         = {A Style-Based Generator Architecture for Generative
                   Adversarial Networks},
  journal       = {CoRR},
  volume        = {abs/1812.04948},
  year          = {2018},
  eprint        = {1812.04948},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1812.04948},
  timestamp     = {Tue, 01 Jan 2019 15:01:25 +0100},
  biburl        = {https://dblp.org/rec/bib/journals/corr/abs-1812-04948},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Karras et al. (2019), StyleGAN image-quality analysis: arXiv preprint 1912.04958.
% eprint/archivePrefix/url repeat the identifier already given in the volume field,
% matching the other CoRR entries in this file so eprint-aware styles can link it.
@article{Karras2019stylegan2,
  author        = {Karras, Tero and Laine, Samuli and Aittala, Miika and
                   Hellsten, Janne and Lehtinen, Jaakko and Aila, Timo},
  title         = {Analyzing and Improving the Image Quality of {StyleGAN}},
  journal       = {CoRR},
  volume        = {abs/1912.04958},
  year          = {2019},
  eprint        = {1912.04958},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/1912.04958},
}

% Cornia et al. (2019), Meshed-Memory Transformer: arXiv preprint 1912.08226 (cs.CV).
% The inline math is wrapped in braces so that case-changing styles treat
% "M$^2$" as one protected unit and never rewrite the math-mode content.
@misc{cornia2019m2,
  author        = {Cornia, Marcella and Stefanini, Matteo and
                   Baraldi, Lorenzo and Cucchiara, Rita},
  title         = {{M$^2$}: Meshed-Memory Transformer for Image Captioning},
  year          = {2019},
  eprint        = {1912.08226},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Perez-Rua et al. (2020), attention for video action modeling: arXiv preprint 2004.01278 (cs.CV).
@misc{perezrua2020knowing,
  author        = {Perez-Rua, Juan-Manuel and Martinez, Brais and
                   Zhu, Xiatian and Toisoul, Antoine and Escorcia, Victor and
                   Xiang, Tao},
  title         = {Knowing What, Where and When to Look: Efficient Video
                   Action Modeling with Attention},
  year          = {2020},
  eprint        = {2004.01278},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CV},
}

% Luo and Shakhnarovich (2020), diversity-accuracy tradeoff in captioning: arXiv preprint 2002.11848 (cs.CL).
@misc{luo2020analysis,
  author        = {Luo, Ruotian and Shakhnarovich, Gregory},
  title         = {Analysis of diversity-accuracy tradeoff in image captioning},
  year          = {2020},
  eprint        = {2002.11848},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
}

% Bertasius, Wang and Torresani (2021), space-time attention for video; entry keyed by its arXiv DOI.
% The keyword list previously repeated "FOS: Computer and information sciences"; it now appears once.
% eprint/archivePrefix repeat the arXiv identifier already present in doi/url,
% matching the other arXiv entries in this file.
@misc{https://doi.org/10.48550/arxiv.2102.05095,
  author        = {Bertasius, Gedas and Wang, Heng and Torresani, Lorenzo},
  title         = {Is Space-Time Attention All You Need for Video
                   Understanding?},
  publisher     = {arXiv},
  year          = {2021},
  doi           = {10.48550/ARXIV.2102.05095},
  eprint        = {2102.05095},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2102.05095},
  keywords      = {Computer Vision and Pattern Recognition (cs.CV), FOS:
                   Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% Li et al. (2022), BLIP: arXiv preprint 2201.12086, DBLP CoRR record.
% The archive is declared with archivePrefix, the field name used by every other
% entry in this file (biblatex reads it as an alias of eprinttype).
@article{DBLP:journals/corr/abs-2201-12086,
  author        = {Li, Junnan and Li, Dongxu and Xiong, Caiming and
                   Hoi, Steven C. H.},
  title         = {{BLIP:} Bootstrapping Language-Image Pre-training for
                   Unified Vision-Language Understanding and Generation},
  journal       = {CoRR},
  volume        = {abs/2201.12086},
  year          = {2022},
  eprint        = {2201.12086},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2201.12086},
  timestamp     = {Wed, 02 Feb 2022 15:00:01 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2201-12086.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% Cho et al. (2022), image captioning with a CLIP reward; entry keyed by its arXiv DOI.
% {CLIP} is braced so that sentence-casing styles keep the acronym uppercase.
% The keyword list previously repeated "FOS: Computer and information sciences"; it now appears once.
% eprint/archivePrefix repeat the arXiv identifier already present in doi/url,
% matching the other arXiv entries in this file.
@misc{https://doi.org/10.48550/arxiv.2205.13115,
  author        = {Cho, Jaemin and Yoon, Seunghyun and Kale, Ajinkya and
                   Dernoncourt, Franck and Bui, Trung and Bansal, Mohit},
  title         = {Fine-grained Image Captioning with {CLIP} Reward},
  publisher     = {arXiv},
  year          = {2022},
  doi           = {10.48550/ARXIV.2205.13115},
  eprint        = {2205.13115},
  archivePrefix = {arXiv},
  url           = {https://arxiv.org/abs/2205.13115},
  keywords      = {Computation and Language (cs.CL), Artificial Intelligence
                   (cs.AI), Computer Vision and Pattern Recognition (cs.CV),
                   FOS: Computer and information sciences},
  copyright     = {arXiv.org perpetual, non-exclusive license},
}

% Ge et al. (2024), Visual Fact Checker; conference paper.
% The {CVPR} group is closed right after the acronym: whitespace left inside the
% braces would be printed before the closing parenthesis, giving "(CVPR )".
@inproceedings{ge2024visual,
  author    = {Ge, Yunhao and Zeng, Xiaohui and Huffman, Jacob Samuel and
               Lin, Tsung-Yi and Liu, Ming-Yu and Cui, Yin},
  title     = {Visual Fact Checker: Enabling High-Fidelity Detailed Caption
               Generation},
  booktitle = {IEEE Conference on Computer Vision and Pattern Recognition
               ({CVPR})},
  year      = {2024},
}