@article {3827, title = {Finding any Waldo with zero-shot invariant and efficient visual search}, journal = {Nature Communications}, volume = {9}, year = {2018}, month = {09/2018}, abstract = {

Searching for a target object in a cluttered scene constitutes a fundamental challenge in daily vision. Visual search must be selective enough to discriminate the target from distractors, invariant to changes in the appearance of the target, efficient to avoid exhaustive exploration of the image, and must generalize to locate novel target objects with zero-shot training. Previous work on visual search has focused on searching for perfect matches of a target after extensive category-specific training. Here, we show for the first time that humans can efficiently and invariantly search for natural objects in complex scenes. To gain insight into the mechanisms that guide visual search, we propose a biologically inspired computational model that can locate targets without exhaustive sampling and which can generalize to novel objects. The model provides an approximation to the mechanisms integrating bottom-up and top-down signals during search in natural scenes.

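For illustration, a minimal sketch of the kind of computation this entry describes: top-down modulation of the search image by target features, followed by sequential, non-exhaustive fixation selection with inhibition of return. The backbone, pooling, inhibition radius, and function names below are assumptions for illustration, not the authors' released implementation.

# Illustrative sketch: top-down target modulation plus sequential fixation selection.
# Backbone, pooling, and inhibition-of-return radius are assumptions, not the paper's code.
import torch
import torch.nn.functional as F
from torchvision import models

backbone = models.vgg16(weights="IMAGENET1K_V1").features.eval()

def attention_map(search_img, target_img):
    """Cross-correlate pooled target features with the search-image feature map."""
    with torch.no_grad():
        search_feat = backbone(search_img)                    # (1, C, h, w)
        target_feat = backbone(target_img).mean(dim=(2, 3))   # (1, C) pooled target features
    kernel = target_feat.view(1, -1, 1, 1)                    # 1x1 "template" kernel
    attn = F.conv2d(search_feat, kernel)                      # similarity at every location
    return F.interpolate(attn, size=search_img.shape[-2:], mode="bilinear",
                         align_corners=False).squeeze()       # (H, W) attention map

def fixation_sequence(search_img, target_img, n_fixations=10, ior_radius=30):
    """Winner-take-all fixations with inhibition of return around visited locations."""
    attn = attention_map(search_img, target_img)
    fixations = []
    for _ in range(n_fixations):
        idx = torch.argmax(attn)
        y, x = divmod(idx.item(), attn.shape[1])
        fixations.append((x, y))
        attn[max(0, y - ior_radius):y + ior_radius,
             max(0, x - ior_radius):x + ior_radius] = float("-inf")
    return fixations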
}, doi = {10.1038/s41467-018-06217-x}, url = {http://www.nature.com/articles/s41467-018-06217-x}, author = {Zhang, Mengmi and Feng, Jiashi and Ma, Keng Teck and Lim, Joo Hwee and Zhao, Qi and Kreiman, Gabriel} } @article {3959, title = {What am I searching for?}, year = {2018}, month = {07/2018}, abstract = {

Can we infer intentions and goals from a person{\textquoteright}s actions? As an example of this family of problems, we consider here whether it is possible to decipher what a person is searching for by decoding their eye movement behavior. We conducted two human psychophysics experiments on object arrays and natural images where we monitored subjects{\textquoteright} eye movements while they were looking for a target object. Using as input the pattern of "error" fixations on non-target objects before the target was found, we developed a model (InferNet) whose goal was to infer what the target was. "Error" fixations share similar features with the sought target. The InferNet model uses a pre-trained 2D convolutional architecture to extract features from the error fixations and computes a 2D similarity map between each error fixation and all locations in the search image by modulating the search image via convolution across layers. InferNet consolidates the modulated response maps across layers via max pooling to keep track of the sub-patterns highly similar to features at error fixations and integrates these maps across all error fixations. InferNet successfully identifies the subject{\textquoteright}s goal and outperforms all competing null models, even without any object-specific training on the inference task.

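For illustration, a minimal sketch of the target-inference computation summarized above: features at each "error" fixation are matched against every location of the search image, and the resulting similarity maps are integrated across fixations. The backbone, patch size, and single-layer simplification (the model described above pools responses across layers) are assumptions for illustration, not the authors' released implementation.

# Illustrative sketch: infer the search target by matching features at "error"
# fixations against every location in the image, then integrating across fixations.
# Backbone, patch size, and single-layer use are simplifying assumptions.
import torch
import torch.nn.functional as F
from torchvision import models

backbone = models.vgg16(weights="IMAGENET1K_V1").features.eval()

def crop_patch(img, fixation, size=64):
    """Extract a square patch centered on an (x, y) fixation (assumed away from borders)."""
    x, y = fixation
    half = size // 2
    return img[..., y - half:y + half, x - half:x + half]

def target_inference_map(search_img, error_fixations):
    with torch.no_grad():
        img_feat = backbone(search_img)                       # (1, C, h, w)
        total = torch.zeros(img_feat.shape[-2:])
        for fix in error_fixations:
            patch_feat = backbone(crop_patch(search_img, fix)).mean(dim=(2, 3))  # (1, C)
            kernel = patch_feat.view(1, -1, 1, 1)
            sim = F.conv2d(img_feat, kernel).squeeze()        # similarity at all locations
            total = total + sim                               # integrate across error fixations
    return total   # peaks mark locations/objects resembling the sought target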
}, author = {Zhang, Mengmi and Feng, Jiashi and Lim, Joo Hwee and Zhao, Qi and Kreiman, Gabriel} } @inbook {2126, title = {Recognition of occluded objects}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, publisher = {Springer Singapore}, isbn = {978-981-10-0211-3}, url = {http://www.springer.com/us/book/9789811002113}, author = {Tang, Hanlin and Kreiman, Gabriel and Zhao, Qi} } @article {1617, title = {Foveation-based Mechanisms Alleviate Adversarial Examples}, number = {044}, year = {2016}, month = {01/2016}, abstract = {

We show that adversarial examples, i.e., the visually imperceptible perturbations that cause Convolutional Neural Networks (CNNs) to fail, can be alleviated with a mechanism based on foveations---applying the CNN in different image regions. To see this, first, we report results on ImageNet that lead to a revision of the hypothesis that adversarial perturbations are a consequence of CNNs acting as a linear classifier: CNNs act locally linearly in response to changes in the image regions containing objects recognized by the CNN, and in other regions the CNN may act non-linearly. Then, we corroborate that when the neural responses are linear, applying the foveation mechanism to the adversarial example tends to significantly reduce the effect of the perturbation. This is because, hypothetically, CNNs trained on ImageNet are robust to the changes of scale and translation of the object produced by the foveation, but this property does not generalize to transformations of the perturbation. As a result, the accuracy after a foveation is almost the same as the accuracy of the CNN without the adversarial perturbation, even if the adversarial perturbation is computed with the foveation taken into account.

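For illustration, a minimal sketch of a foveation mechanism in the sense used above: the CNN is applied to an object region that is cropped and rescaled to the network's input size. The model choice and the box argument are assumptions for illustration, not the authors' implementation.

# Illustrative sketch: classify a "foveated" view (object crop rescaled to the
# network's input size) instead of the full image. Model and box are placeholders.
import torch
import torch.nn.functional as F
from torchvision import models

cnn = models.vgg16(weights="IMAGENET1K_V1").eval()

def classify_foveated(img, box):
    """img: (1, 3, H, W) ImageNet-normalized image; box: (x0, y0, x1, y1) around the object."""
    x0, y0, x1, y1 = box
    crop = img[..., y0:y1, x0:x1]
    crop = F.interpolate(crop, size=(224, 224), mode="bilinear", align_corners=False)
    with torch.no_grad():
        return cnn(crop).argmax(dim=1)    # predicted class for the foveated view

# Per the abstract, the same call on an adversarially perturbed image largely recovers
# the clean prediction: cropping and rescaling disrupt the perturbation but not the object.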
}, author = {Luo, Yan and Boix, X. and Roig, Gemma and Poggio, Tomaso and Zhao, Qi} } @article {390, title = {Predicting Saliency Beyond Pixels}, year = {2014}, month = {01/2014}, abstract = {

A large body of previous models for predicting where people look in natural scenes has focused on pixel-level image attributes. To bridge the semantic gap between the predictive power of computational saliency models and human behavior, we propose a new saliency architecture that incorporates information at three layers: pixel-level image attributes, object-level attributes, and semantic-level attributes. Object- and semantic-level information is frequently ignored, or only a few sample object categories are considered, an approach that is neither feasible nor neurally plausible to scale to a large number of object categories. To address this problem, this work constructs a principled vocabulary of basic attributes to describe object- and semantic-level information, thereby avoiding a restriction to a limited number of object categories. We build a new dataset of 700 images with eye-tracking data of 15 viewers and annotation data of 5551 segmented objects with fine contours and 12 semantic attributes. Experimental results demonstrate the importance of object- and semantic-level information in the prediction of visual attention.

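For illustration, a minimal sketch of the three-layer combination described above: pixel-, object-, and semantic-level attribute maps are fused into a single saliency map. The attribute names and the weighted-sum fusion are placeholders; in the proposed architecture the combination of attribute channels is learned from the eye-tracking data.

# Illustrative sketch: fuse pixel-, object-, and semantic-level attribute maps into
# one saliency map. Maps, attribute names, and weights are placeholders for illustration.
import numpy as np

def fuse_saliency(pixel_map, object_maps, semantic_maps, weights):
    """All maps are HxW arrays in [0, 1]; weights maps an attribute name to its learned weight."""
    saliency = weights["pixel"] * pixel_map
    for name, m in object_maps.items():        # e.g. object size, convexity, eccentricity
        saliency += weights[name] * m
    for name, m in semantic_maps.items():      # e.g. face, text
        saliency += weights[name] * m
    s = saliency - saliency.min()
    return s / (s.max() + 1e-8)                # normalize to [0, 1]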
}, url = {http://www.ece.nus.edu.sg/stfpage/eleqiz/predicting.html}, author = {Xu, Juan and Jiang, Ming and Wang, Shuo and Kankanhalli, Mohan and Zhao, Qi} }