@article {3511, title = {DeepVoting: A Robust and Explainable Deep Network for Semantic Part Detection under Partial Occlusion}, year = {2018}, month = {06/2018}, abstract = {

In this paper, we study the task of detecting semantic parts of an object, e.g., a wheel of a car, under partial occlusion. We propose that all models should be trained without seeing occlusions while being able to transfer the learned knowledge to deal with occlusions. This setting alleviates the difficulty of collecting an exponentially large dataset to cover all occlusion patterns, and is therefore the more essential one. In this scenario, proposal-based deep networks, such as the R-CNN series, often produce unsatisfactory results, because both the proposal extraction and the classification stage can be confused by irrelevant occluders. To address this, [25] proposed a voting mechanism that combines multiple local visual cues to detect semantic parts, so that the semantic parts can still be detected even when some visual cues are missing due to occlusion. However, that method is manually designed and therefore hard to optimize in an end-to-end manner.

In this paper, we present DeepVoting, which incorporates the robustness demonstrated by [25] into a deep network, so that the whole pipeline can be jointly optimized. Specifically, it adds two layers on top of the intermediate features of a deep network, e.g., the pool-4 layer of VGGNet. The first layer extracts the evidence of local visual cues, and the second layer performs voting by exploiting the spatial relationship between visual cues and semantic parts. We also propose an improved version, DeepVoting+, which learns visual cues from the context outside the objects. In experiments, DeepVoting achieves significantly better performance than several baseline methods, including Faster R-CNN, for semantic part detection under occlusion. In addition, DeepVoting is explainable: the detection results can be diagnosed by looking up the voting cues.
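To make the two added layers concrete, below is a minimal PyTorch sketch of the structure described above (an illustration, not the authors' released code); the number of visual concepts, the number of semantic parts, and the voting kernel size are placeholder choices.

import torch
import torch.nn as nn
import torchvision

class DeepVotingHead(nn.Module):
    def __init__(self, num_concepts=256, num_parts=39, vote_kernel=15):
        super().__init__()
        # Layer 1: extract evidence of local visual cues from pool-4 features.
        self.concept_layer = nn.Sequential(
            nn.Conv2d(512, num_concepts, kernel_size=1),
            nn.ReLU(inplace=True),
        )
        # Layer 2: voting; the large spatial kernel encodes the spatial
        # relationship between visual cues and semantic parts.
        self.voting_layer = nn.Conv2d(num_concepts, num_parts,
                                      kernel_size=vote_kernel,
                                      padding=vote_kernel // 2)

    def forward(self, pool4):
        cues = self.concept_layer(pool4)   # (N, K, H, W) cue evidence maps
        votes = self.voting_layer(cues)    # (N, S, H, W) semantic-part heatmaps
        return votes

# Usage: run VGG-16 up to the pool-4 layer, then apply the voting head.
backbone = torchvision.models.vgg16(weights=None).features[:24]  # up to pool-4
head = DeepVotingHead()
image = torch.randn(1, 3, 224, 224)
heatmaps = head(backbone(image))           # shape (1, 39, 14, 14)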

}, author = {Zhishuai Zhang and Cihang Xie and Jianyu Wang and Lingxi Xie and Alan Yuille} } @conference {3548, title = {DeepVoting: An Explainable Framework for Semantic Part Detection under Partial Occlusion}, booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2018}, month = {06/2018}, address = {Salt Lake City, Utah}, abstract = {

In this paper, we study the task of detecting semantic parts of an object, e.g., a wheel of a car, under partial occlusion. We propose that all models should be trained without seeing occlusions while being able to transfer the learned knowledge to deal with occlusions. This setting alleviates the difficulty of collecting an exponentially large dataset to cover all occlusion patterns, and is therefore the more essential one. In this scenario, proposal-based deep networks, such as the R-CNN series, often produce unsatisfactory results, because both the proposal extraction and the classification stage can be confused by irrelevant occluders. To address this, [25] proposed a voting mechanism that combines multiple local visual cues to detect semantic parts, so that the semantic parts can still be detected even when some visual cues are missing due to occlusion. However, that method is manually designed and therefore hard to optimize in an end-to-end manner. In this paper, we present DeepVoting, which incorporates the robustness demonstrated by [25] into a deep network, so that the whole pipeline can be jointly optimized. Specifically, it adds two layers on top of the intermediate features of a deep network, e.g., the pool-4 layer of VGGNet. The first layer extracts the evidence of local visual cues, and the second layer performs voting by exploiting the spatial relationship between visual cues and semantic parts. We also propose an improved version, DeepVoting+, which learns visual cues from the context outside the objects. In experiments, DeepVoting achieves significantly better performance than several baseline methods, including Faster R-CNN, for semantic part detection under occlusion. In addition, DeepVoting is explainable: the detection results can be diagnosed by looking up the voting cues.

}, url = {http://cvpr2018.thecvf.com/}, author = {Zhishuai Zhang and Cihang Xie and Jianyu Wang and Lingxi Xie and Alan Yuille} } @conference {3549, title = {Single-Shot Object Detection with Enriched Semantics}, booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2018}, month = {06/2018}, address = {Salt Lake City, Utah}, abstract = {

We propose a novel single-shot object detection network named Detection with Enriched Semantics (DES). Our motivation is to enrich the semantics of object detection features within a typical deep detector, by means of a semantic segmentation branch and a global activation module. The segmentation branch is supervised by weak segmentation ground-truth, i.e., no extra annotation is required. In conjunction with it, we employ a global activation module which learns the relationship between feature channels and object classes in a self-supervised manner. Comprehensive experimental results on both the PASCAL VOC and MS COCO detection datasets demonstrate the effectiveness of the proposed method. In particular, with a VGG16-based DES, we achieve an mAP of 81.7 on VOC2007 test and an mAP of 32.8 on COCO test-dev, with an inference speed of 31.5 milliseconds per image on a Titan Xp GPU. With a lower-resolution version, we achieve an mAP of 79.7 on VOC2007 with an inference speed of 13.0 milliseconds per image.
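The two add-ons can be pictured with the rough PyTorch sketch below (assumed shapes and class counts, not the authors' implementation): a weakly supervised segmentation branch whose activations re-weight the detection feature map, and a global activation module approximated here by a squeeze-and-excitation style channel gate.

import torch
import torch.nn as nn

class SegmentationBranch(nn.Module):
    """Predicts coarse per-class masks from a 512-channel feature map and
    uses its own activations to semantically re-weight that feature map."""
    def __init__(self, in_ch=512, num_classes=21):
        super().__init__()
        self.body = nn.Sequential(
            nn.Conv2d(in_ch, in_ch, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(in_ch, in_ch, 3, padding=1), nn.ReLU(inplace=True),
        )
        # supervised only by weak (box-derived) segmentation ground-truth
        self.mask_head = nn.Conv2d(in_ch, num_classes, kernel_size=1)

    def forward(self, x):
        h = self.body(x)
        seg_logits = self.mask_head(h)       # weak segmentation prediction
        enriched = x * torch.sigmoid(h)      # semantically enriched features
        return enriched, seg_logits

class GlobalActivation(nn.Module):
    """Global pooling plus per-channel gating, learning which feature
    channels matter for which image content."""
    def __init__(self, channels=512, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction), nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels), nn.Sigmoid(),
        )

    def forward(self, x):
        w = self.fc(x.mean(dim=(2, 3)))      # (N, C) channel descriptors
        return x * w[:, :, None, None]       # channel-wise re-activation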

}, url = {http://cvpr2018.thecvf.com/}, author = {Zhishuai Zhang and Siyuan Qiao and Cihang Xie and Wei Shen and Bo Wang and Alan Yuille} } @article {3509, title = {Single-Shot Object Detection with Enriched Semantics}, year = {2018}, month = {06/2018}, abstract = {

We propose a novel single-shot object detection network named Detection with Enriched Semantics (DES). Our motivation is to enrich the semantics of object detection features within a typical deep detector, by means of a semantic segmentation branch and a global activation module. The segmentation branch is supervised by weak segmentation ground-truth, i.e., no extra annotation is required. In conjunction with it, we employ a global activation module which learns the relationship between feature channels and object classes in a self-supervised manner. Comprehensive experimental results on both the PASCAL VOC and MS COCO detection datasets demonstrate the effectiveness of the proposed method. In particular, with a VGG16-based DES, we achieve an mAP of 81.7 on VOC2007 test and an mAP of 32.8 on COCO test-dev, with an inference speed of 31.5 milliseconds per image on a Titan Xp GPU. With a lower-resolution version, we achieve an mAP of 79.7 on VOC2007 with an inference speed of 13.0 milliseconds per image.

}, author = {Zhishuai Zhang and Siyuan Qiao and Cihang Xie and Wei Shen and Bo Wang and Alan Yuille} } @article {3545, title = {Visual Concepts and Compositional Voting}, journal = {Annals of Mathematical Sciences and Applications (AMSA)}, volume = {3}, year = {2018}, pages = {151{\textendash}188}, abstract = {

It is very attractive to formulate vision in terms of pattern theory \cite{Mumford2010pattern}, where patterns are defined hierarchically by compositions of elementary building blocks. But applying pattern theory to real-world images is currently less successful than discriminative methods such as deep networks. Deep networks, however, are black boxes which are hard to interpret and can easily be fooled by adding occluding objects. It is natural to wonder whether, by better understanding deep networks, we can extract building blocks which can be used to develop pattern-theoretic models. This motivates us to study the internal representations of a deep network using vehicle images from the PASCAL3D+ dataset. We use clustering algorithms to study the population activities of the features and extract a set of visual concepts which we show are visually tight and correspond to semantic parts of vehicles. To analyze this, we annotate these vehicles by their semantic parts to create a new dataset, VehicleSemanticParts, and evaluate visual concepts as unsupervised part detectors. We show that visual concepts perform fairly well but are outperformed by supervised discriminative methods such as Support Vector Machines (SVMs). We next give a more detailed analysis of visual concepts and how they relate to semantic parts. Following this, we use the visual concepts as building blocks for a simple pattern-theoretic model, which we call compositional voting, in which several visual concepts combine to detect semantic parts. We show that this approach is significantly better than discriminative methods like SVMs and deep networks trained specifically for semantic part detection. Finally, we return to studying occlusion by creating an annotated dataset with occlusion, called VehicleOcclusion, and show that compositional voting outperforms even deep networks when the amount of occlusion becomes large.
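A toy NumPy sketch of the voting step may help fix ideas (the per-concept likelihoods, offsets, and window size are placeholders, not values from the paper): each visual-concept firing casts a log-likelihood-ratio vote at the part location it predicts, subject to a spatial constraint.

import numpy as np

def vote_for_part(concept_maps, offsets, fg_prob, bg_prob, radius=3):
    """concept_maps: (K, H, W) binary firings of K visual concepts.
    offsets: (K, 2) integer displacement from each concept to the part.
    fg_prob / bg_prob: (K,) firing probability near the part vs. background.
    Returns an (H, W) voting score map for one semantic part."""
    K, H, W = concept_maps.shape
    score = np.zeros((H, W))
    log_ratio = np.log(fg_prob / bg_prob)          # evidence weight per concept
    for k in range(K):
        ys, xs = np.nonzero(concept_maps[k])
        for y, x in zip(ys, xs):
            cy, cx = y + offsets[k, 0], x + offsets[k, 1]
            if not (0 <= cy < H and 0 <= cx < W):  # vote falls off the map
                continue
            # spread the vote over a small window (the spatial constraint)
            y0, y1 = max(cy - radius, 0), min(cy + radius + 1, H)
            x0, x1 = max(cx - radius, 0), min(cx + radius + 1, W)
            score[y0:y1, x0:x1] += log_ratio[k]
    return score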

}, keywords = {deep networks, pattern theory, visual concepts}, doi = {10.4310/AMSA.2018.v3.n1.a5}, url = {http://www.intlpress.com/site/pub/pages/journals/items/amsa/content/vols/0003/0001/a005/index.html}, author = {Jianyu Wang and Zhishuai Zhang and Cihang Xie and Yuyin Zhou and Vittal Premachandran and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3594, title = {Visual concepts and compositional voting}, year = {2018}, month = {03/2018}, abstract = {

It is very attractive to formulate vision in terms of pattern theory [26], where patterns are defined hierarchically by compositions of elementary building blocks. But applying pattern theory to real-world images is very challenging and is currently less successful than discriminative methods such as deep networks. Deep networks, however, are black boxes which are hard to interpret and, as we will show, can easily be fooled by adding occluding objects. It is natural to wonder whether, by better understanding deep networks, we can extract building blocks which can be used to develop pattern-theoretic models. This motivates us to study the internal feature vectors of a deep network using images of vehicles from the PASCAL3D+ dataset with the scale of the objects fixed. We use clustering algorithms, such as K-means, to study the population activity of the features and extract a set of visual concepts which we show are visually tight and correspond to semantic parts of the vehicles. To analyze this in more detail, we annotate these vehicles by their semantic parts to create a new dataset, which we call VehicleSemanticParts, and evaluate visual concepts as unsupervised semantic part detectors. Our results show that visual concepts perform fairly well but are outperformed by supervised discriminative methods such as Support Vector Machines. We next give a more detailed analysis of visual concepts and how they relate to semantic parts. Following this analysis, we use the visual concepts as building blocks for a simple pattern-theoretic model, which we call compositional voting, in which several visual concepts combine to detect semantic parts. We show that this approach is significantly better than discriminative methods like Support Vector Machines and deep networks trained specifically for semantic part detection. Finally, we return to studying occlusion by creating an annotated dataset with occlusion, called VehicleOcclusion, and show that compositional voting outperforms even deep networks when the amount of occlusion becomes large.
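The extraction of visual concepts can be sketched as follows (illustrative only; the number of concepts and the cosine-style scoring are assumptions, not the paper's exact recipe): cluster L2-normalized pool-4 feature vectors with K-means and treat each cluster center as an unsupervised detector.

import numpy as np
from sklearn.cluster import KMeans

def extract_visual_concepts(pool4_vectors, num_concepts=200):
    """pool4_vectors: (N, C) feature vectors sampled from many vehicle images
    at a fixed object scale. Returns (num_concepts, C) cluster centers."""
    feats = pool4_vectors / (np.linalg.norm(pool4_vectors, axis=1,
                                            keepdims=True) + 1e-8)
    km = KMeans(n_clusters=num_concepts, n_init=10, random_state=0).fit(feats)
    return km.cluster_centers_

def concept_responses(feature_map, centers):
    """feature_map: (C, H, W). Returns (K, H, W) similarity of each position
    to each visual concept, usable as an unsupervised part detector."""
    C, H, W = feature_map.shape
    flat = feature_map.reshape(C, -1).T                        # (H*W, C)
    flat = flat / (np.linalg.norm(flat, axis=1, keepdims=True) + 1e-8)
    return (flat @ centers.T).T.reshape(-1, H, W)              # cosine-like score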

}, author = {Jianyu Wang and Zhishuai Zhang and Cihang Xie and Yuyin Zhou and Vittal Premachandran and Jun Zhu and Lingxi Xie and Alan Yuille} } @conference {3547, title = {Detecting Semantic Parts on Partially Occluded Objects}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2017}, month = {09/2017}, address = {London, UK}, abstract = {

In this paper, we address the task of detecting semantic parts on partially occluded objects. We consider a scenario where the model is trained using non-occluded images but tested on occluded images. The motivation is that there is an infinite number of occlusion patterns in the real world, which cannot be fully covered by the training data, so models should be inherently robust and adaptive to occlusion rather than fitting the occlusion patterns seen in the training data. Our approach detects semantic parts by accumulating the confidence of local visual cues. Specifically, the method uses a simple voting scheme, based on log-likelihood ratio tests and spatial constraints, to combine the evidence of local cues. These cues are called visual concepts, and are derived by clustering the internal states of deep networks. We evaluate our voting scheme on the VehicleSemanticPart dataset, which has dense part annotations. We randomly place two, three, or four irrelevant objects onto the target object to generate testing images with various occlusions. Experiments show that our algorithm outperforms several competitors in semantic part detection when occlusions are present.
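The occlusion protocol can be illustrated with a small PIL sketch (the placement rule and the source of occluder cut-outs are made up for illustration, not the paper's exact procedure): paste a few irrelevant, cut-out objects onto the target object's bounding box.

import random
from PIL import Image  # target_img and occluders are PIL images

def occlude(target_img, target_box, occluders, num_occluders=3):
    """target_img: PIL image; target_box: integer (x0, y0, x1, y1) of the object;
    occluders: list of RGBA cut-outs of irrelevant objects (len >= num_occluders)."""
    img = target_img.copy()
    x0, y0, x1, y1 = target_box
    for occ in random.sample(occluders, num_occluders):
        # drop the occluder at a random position inside the object box
        px = random.randint(x0, max(x0, x1 - occ.width))
        py = random.randint(y0, max(y0, y1 - occ.height))
        img.paste(occ, (px, py), occ)   # alpha channel keeps the silhouette
    return img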

}, url = {https://bmvc2017.london/proceedings/}, author = {Jianyu Wang and Cihang Xie and Zhishuai Zhang and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3503, title = {Detecting Semantic Parts on Partially Occluded Objects}, year = {2017}, month = {09/2017}, abstract = {

In this paper, we address the task of detecting semantic parts on partially occluded objects. We consider a scenario where the model is trained using non-occluded images but tested on occluded images. The motivation is that there is an infinite number of occlusion patterns in the real world, which cannot be fully covered by the training data, so models should be inherently robust and adaptive to occlusion rather than fitting the occlusion patterns seen in the training data. Our approach detects semantic parts by accumulating the confidence of local visual cues. Specifically, the method uses a simple voting scheme, based on log-likelihood ratio tests and spatial constraints, to combine the evidence of local cues. These cues are called visual concepts, and are derived by clustering the internal states of deep networks. We evaluate our voting scheme on the VehicleSemanticPart dataset, which has dense part annotations. We randomly place two, three, or four irrelevant objects onto the target object to generate testing images with various occlusions. Experiments show that our algorithm outperforms several competitors in semantic part detection when occlusions are present.

}, author = {Jianyu Wang and Cihang Xie and Zhishuai Zhang and Jun Zhu and Lingxi Xie and Alan Yuille} }