@article {5318, title = {Specialized Networks for Social Cognition in the Primate Brain}, year = {In Press}, abstract = {

Primates have evolved diverse cognitive capabilities to navigate their complex social world. To understand how the brain implements critical social cognitive abilities, we describe functional specialization in the domains of face processing, social interaction understanding, and mental state attribution. Systems for face processing are specialized from the level of single cells to populations of neurons within brain regions to hierarchically organized networks that extract and represent abstract social information. Such functional specialization is not confined to the sensorimotor periphery but appears to be a pervasive theme of primate brain organization all the way to the apex regions of cortical hierarchies. Circuits processing social information are juxtaposed with parallel systems involved in processing nonsocial information, suggesting common computations applied to different domains. The emerging picture of the neural basis of social cognition is a set of distinct but interacting subnetworks involved in component processes such as face perception and social reasoning, traversing large parts of the primate brain.

}, author = {Winrich Freiwald and Ben Deen and Julia Sliwa and Schwiedrzik, Caspar M} } @article {5483, title = {Compositional Sparsity of Learnable Functions}, number = {145}, year = {2024}, month = {02/2024}, abstract = {

*This paper will appear in June/July 2024 in the Bulletin of the American Mathematical Society*

Neural networks have demonstrated impressive success in various domains, raising the question of what fundamental principles underlie the effectiveness of the best AI systems and quite possibly of human intelligence. This perspective argues that compositional sparsity, or the property that a compositional function has {\textquotedblleft}few{\textquotedblright} constituent functions, each depending on only a small subset of inputs, is a key principle underlying successful learning architectures. Surprisingly, all functions that are efficiently Turing computable have a compositionally sparse representation. Furthermore, deep networks that are also sparse can exploit this general property to avoid the {\textquotedblleft}curse of dimensionality{\textquotedblright}. This framework suggests interesting implications about the role that machine learning may play in mathematics.

}, author = {Tomaso Poggio and Maia Fraser} } @article {5257, title = {An adversarial collaboration protocol for testing contrasting predictions of global neuronal workspace and integrated information theory}, journal = {PLOS ONE}, volume = {18}, year = {2023}, month = {02/2023}, pages = {e0268577}, abstract = {

The relationship between conscious experience and brain activity has intrigued scientists and philosophers for centuries. In the last decades, several theories have suggested different accounts for these relationships. These theories have developed in parallel, with little to no cross-talk among them. To advance research on consciousness, we established an adversarial collaboration between proponents of two of the major theories in the field, Global Neuronal Workspace and Integrated Information Theory. Together, we devised and preregistered two experiments that test contrasting predictions of these theories concerning the location and timing of correlates of visual consciousness, which have been endorsed by the theories{\textquoteright} proponents. Predicted outcomes should either support, refute, or challenge these theories. Six theory-impartial laboratories will follow the study protocol specified here, using three complementary methods: Functional Magnetic Resonance Imaging (fMRI), Magneto-Electroencephalography (M-EEG), and intracranial electroencephalography (iEEG). The study protocol will include built-in replications, both between labs and within datasets. Through this ambitious undertaking, we hope to provide decisive evidence in favor of or against the two theories and clarify the footprints of conscious visual perception in the human brain, while also providing an innovative model of large-scale, collaborative, and open science practice.

}, doi = {10.1371/journal.pone.0268577}, url = {https://dx.plos.org/10.1371/journal.pone.0268577}, author = {Melloni, Lucia and Mudrik, Liad and Pitts, Michael and Bendtz, Katarina and Ferrante, Oscar and Gorska, Urszula and Hirschhorn, Rony and Khalaf, Aya and Kozma, Csaba and Lepauvre, Alex and Liu, Ling and Mazumder, David and Richter, David and Zhou, Hao and Blumenfeld, Hal and Boly, Melanie and Chalmers, David J. and Devore, Sasha and Fallon, Francis and de Lange, Floris P. and Jensen, Ole and Kreiman, Gabriel and Luo, Huan and Panagiotaropoulos, Theofanis I. and Dehaene, Stanislas and Koch, Christof and Tononi, Giulio}, editor = {Ward, Lawrence M} } @article {5255, title = {BrainBERT: Self-supervised representation learning for Intracranial Electrodes}, year = {2023}, month = {02/2023}, address = {Kigali, Rwanda, Africa}, abstract = {

We create a reusable Transformer, BrainBERT, for intracranial recordings, bringing modern representation learning approaches to neuroscience. Much like in NLP and speech recognition, this Transformer enables classifying complex concepts, i.e., decoding neural data, with higher accuracy and with much less data by being pretrained in an unsupervised manner on a large corpus of unannotated neural recordings. Our approach generalizes to new subjects with electrodes in new positions and to unrelated tasks, showing that the representations robustly disentangle the neural signal. Just like in NLP, where one can study language by investigating what a language model learns, this approach opens the door to investigating the brain through what a model of the brain learns. As a first step along this path, we demonstrate a new analysis of the intrinsic dimensionality of the computations in different areas of the brain. To construct these representations, we combine a technique for producing super-resolution spectrograms of neural data with an approach designed for generating contextual representations of audio by masking. In the future, far more concepts will be decodable from neural recordings by using representation learning, potentially unlocking the brain like language models unlocked language.

}, keywords = {decoding, language models, Neuroscience, self-supervision, transformer}, url = {https://openreview.net/forum?id=xmcYx_reUn6}, author = {Christopher Wang and Vighnesh Subramaniam and Adam Uri Yaari and Gabriel Kreiman and Boris Katz and Ignacio Cases and Andrei Barbu} } @book {5431, title = {Cervelli menti algoritmi}, year = {2023}, month = {10/2023}, pages = {272}, publisher = {Sperling \& Kupfer}, organization = {Sperling \& Kupfer}, abstract = {

Intelligence {\textendash} that thing with which we understand the world {\textendash} is still an open mystery. That only we humans have a language, an alphabet, a science does not mean that we hold a monopoly on intelligence. We share this existence with millions of other species, animals and plants, endowed with such a range of cognitive abilities as to compose an almost infinite gradation of intelligences. Suddenly, their number has begun to grow. Thanks to the joint appearance of more sophisticated algorithms, oceanic databases and enormous computing power, the ancient aspiration to replicate human intelligence mathematically has reached unexpected milestones. Although still far from succeeding, a small zoo of artificial intelligences is already able to carry out numerous typically human tasks. In this book, a journalist and a pioneer of artificial intelligence recount (in the scientist{\textquoteright}s voice) the dawn of a new {\textquotedblleft}general{\textquotedblright} technology that, like electricity or the computer, is destined to transform society, the economy and everyday life, bringing with it both risks and opportunities. What should we expect from this extraordinary evolution? What will we gain and what will we lose? There are no certain answers. But it is certainly an opportunity for new, extraordinary scientific discoveries. Starting with the secrets of intelligence itself.

[translated from the Italian]

}, isbn = {9788820077761}, url = {https://www.sperling.it/libri/cervelli-menti-algoritmi-marco-magrini}, author = {Tomaso Poggio and Marco Magrini} } @article {5313, title = {CNNs reveal the computational implausibility of the expertise hypothesis}, journal = {iScience}, volume = {26}, year = {2023}, month = {02/2023}, pages = {105976}, abstract = {

Face perception has long served as a classic example of domain specificity of mind and brain. But an alternative {\textquotedblleft}expertise{\textquotedblright} hypothesis holds that putatively face-specific mechanisms are actually domain-general, and can be recruited for the perception of other objects of expertise (e.g., cars for car experts). Here, we demonstrate the computational implausibility of this hypothesis: Neural network models optimized for generic object categorization provide a better foundation for expert fine-grained discrimination than do models optimized for face recognition.

}, issn = {25890042}, doi = {10.1016/j.isci.2023.105976}, url = {https://linkinghub.elsevier.com/retrieve/pii/S2589004223000536}, author = {Kanwisher, Nancy and Gupta, Pranjul and Dobs, Katharina} } @article {5253, title = {Cross-task specificity and within-task invariance of cognitive control processes}, journal = {Cell Reports}, volume = {42}, year = {2023}, month = {01/2023}, pages = {111919}, abstract = {

Cognitive control involves flexibly combining multiple sensory inputs with task-dependent goals during decision making. Several tasks involving conflicting sensory inputs and motor outputs have been proposed to examine cognitive control, including the Stroop, Flanker, and multi-source interference task. Because these tasks have been studied independently, it remains unclear whether the neural signatures of cognitive control reflect abstract control mechanisms or specific combinations of sensory and behavioral aspects of each task. To address these questions, we record invasive neurophysiological signals from 16 patients with pharmacologically intractable epilepsy and compare neural responses within and between tasks. Neural signals differ between incongruent and congruent conditions, showing strong modulation by conflicting task demands. These neural signals are mostly specific to each task, generalizing within a task but not across tasks. These results highlight the complex interplay between sensory inputs, motor outputs, and task demands underlying cognitive control processes.

}, issn = {22111247}, doi = {10.1016/j.celrep.2022.111919}, url = {https://linkinghub.elsevier.com/retrieve/pii/S2211124722018174}, author = {Xiao, Yuchen and Chou, Chien-Chen and Cosgrove, Garth Rees and Crone, Nathan E. and Stone, Scellig and Madsen, Joseph R. and Reucroft, Ian and Shih, Yen-Cheng and Weisholtz, Daniel and Yu, Hsiang-Yu and Anderson, William S. and Kreiman, Gabriel} } @article {5214, title = {Decoding of human identity by computer vision and neuronal vision}, journal = {Scientific Reports}, volume = {13}, year = {2023}, month = {01/2023}, abstract = {

Extracting meaning from a dynamic and variable flow of incoming information is a major goal of both natural and artificial intelligence. Computer vision (CV) guided by deep learning (DL) has made significant strides in recognizing a specific identity despite highly variable attributes. This is the same challenge faced by the nervous system and partially addressed by the concept cells{\textemdash}neurons exhibiting selective firing in response to specific persons/places, described in the human medial temporal lobe (MTL). Yet, access to neurons representing a particular concept is limited due to these neurons{\textquoteright} sparse coding. It is conceivable, however, that the information required for such decoding is present in relatively small neuronal populations. To evaluate how well neuronal populations encode identity information in natural settings, we recorded neuronal activity from multiple brain regions of nine neurosurgical epilepsy patients implanted with depth electrodes, while the subjects watched an episode of the TV series {\textquotedblleft}24{\textquotedblright}. First, we devised a minimally supervised CV algorithm (with comparable performance against manually-labeled data) to detect the most prevalent characters (above 1\% overall appearance) in each frame. Next, we implemented DL models that used the time-varying population neural data as inputs and decoded the visual presence of the four main characters throughout the episode. This methodology allowed us to compare {\textquotedblleft}computer vision{\textquotedblright} with {\textquotedblleft}neuronal vision{\textquotedblright}{\textemdash}footprints associated with each character present in the activity of a subset of neurons{\textemdash}and identify the brain regions that contributed to this decoding process. We then tested the DL models during a recognition memory task following movie viewing where subjects were asked to recognize clip segments from the presented episode. DL model activations were not only modulated by the presence of the corresponding characters but also by participants{\textquoteright} subjective memory of whether they had seen the clip segment, and by the associative strengths of the characters in the narrative plot. The described approach can offer novel ways to probe the representation of concepts in time-evolving dynamic behavioral tasks. Further, the results suggest that the information required to robustly decode concepts is present in the population activity of only tens of neurons even in brain regions beyond MTL.

}, doi = {10.1038/s41598-022-26946-w}, url = {https://www.nature.com/articles/s41598-022-26946-w}, author = {Zhang, Yipeng and Aghajan, Zahra M. and Ison, Matias and Lu, Qiujing and Tang, Hanlin and Kalender, Guldamla and Monsoor, Tonmoy and Zheng, Jie and Kreiman, Gabriel and Roychowdhury, Vwani and Fried, Itzhak} } @article {5217, title = {Dynamics in Deep Classifiers trained with the Square Loss: normalization, low rank, neural collapse and generalization bounds}, journal = {Research}, year = {2023}, month = {01/2023}, abstract = {

We overview several properties {\textendash} old and new {\textendash} of training overparametrized deep networks under the square loss. We first consider a model of the dynamics of gradient flow under the square loss in deep homogeneous ReLU networks. We study the convergence to a solution with the absolute minimum ρ, which is the product of the Frobenius norms of each layer weight matrix, when normalization by Lagrange multipliers (LM) is used together with Weight Decay (WD) under different forms of gradient descent. A main property of the minimizers that bounds their expected error for a specific network architecture is ρ. In particular, we derive novel norm-based bounds for convolutional layers that are orders of magnitude better than classical bounds for dense networks. Next we prove that quasi-interpolating solutions obtained by Stochastic Gradient Descent (SGD) in the presence of WD have a bias towards low rank weight matrices {\textendash} that should improve generalization. The same analysis predicts the existence of an inherent SGD noise for deep networks. In both cases, we verify our predictions experimentally. We then predict Neural Collapse and its properties without any specific assumption {\textendash} unlike other published proofs. Our analysis supports the idea that the advantage of deep networks relative to other classifiers is greater for problems that are appropriate for sparse deep architectures such as CNNs. The reason is that compositionally sparse target functions can be approximated well by {\textquotedblleft}sparse{\textquotedblright} deep networks without incurring the curse of dimensionality.

}, doi = {10.34133/research.0024}, url = {https://spj.science.org/doi/10.34133/research.0024}, author = {Xu, Mengjia and Rangamani, Akshay and Liao, Qianli and Galanti, Tomer and Poggio, Tomaso} } @article {5358, title = {Emotion prediction as computation over a generative theory of mind}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, volume = {381}, year = {2023}, month = {07/2023}, abstract = {

From sparse descriptions of events, observers can make systematic and nuanced predictions of what emotions the people involved will experience. We propose a formal model of emotion prediction in the context of a public high-stakes social dilemma. This model uses inverse planning to infer a person{\textquoteright}s beliefs and preferences, including social preferences for equity and for maintaining a good reputation. The model then combines these inferred mental contents with the event to compute {\textquoteleft}appraisals{\textquoteright}: whether the situation conformed to the expectations and fulfilled the preferences. We learn functions mapping computed appraisals to emotion labels, allowing the model to match human observers{\textquoteright} quantitative predictions of 20 emotions, including joy, relief, guilt and envy. Model comparison indicates that inferred monetary preferences are not sufficient to explain observers{\textquoteright} emotion predictions; inferred social preferences are factored into predictions for nearly every emotion. Human observers and the model both use minimal individualizing information to adjust predictions of how different people will respond to the same event. Thus, our framework integrates inverse planning, event appraisals and emotion concepts in a single computational model to reverse-engineer people{\textquoteright}s intuitive theory of emotions.

}, keywords = {affective computing, emotion, inverse planning, probabilistic generative model, social intelligence, theory of mind}, issn = {1364-503X}, doi = {10.1098/rsta.2022.0047}, url = {https://royalsocietypublishing.org/doi/10.1098/rsta.2022.0047}, author = {Houlihan, Sean Dae and Kleiman-Weiner, Max and Hewitt, Luke B. and Tenenbaum, Joshua B. and Saxe, Rebecca} } @article {5312, title = {An empirical assay of view-invariant object learning in humans and comparison with baseline image-computable models}, journal = {bioRxiv}, year = {2023}, month = {01/2023}, abstract = {

How humans learn new visual objects is a longstanding scientific problem. Previous work has led to a diverse collection of models for how it is accomplished, but a current limitation in the field is a lack of empirical benchmarks which can be used to evaluate and compare specific models against each other. Here, we use online psychophysics to measure human behavioral learning trajectories over a set of tasks involving novel 3D objects. Consistent with intuition, these results show that humans generally require very few images (≈ 6) to approach their asymptotic accuracy, find some object discriminations easier to learn than others, and generalize quite well over a range of image transformations after even one view of each object. We then use those data to develop benchmarks that may be used to evaluate a learning model{\textquoteright}s similarity to humans. We make these data and benchmarks publicly available [GitHub], and, to our knowledge, they are currently the largest publicly-available collection of learning-related psychophysics data in humans. Additionally, to serve as baselines for those benchmarks, we implement and test a large number of baseline models (n=1,932), each based on a standard cognitive theory of learning: that humans re-represent images in a fixed, Euclidean space, then learn linear decision boundaries in that space to identify objects in future images. We find some of these baseline models make surprisingly accurate predictions. However, we also find reliable prediction gaps between all baseline models and humans, particularly in the few-shot learning setting.

}, url = {https://www.biorxiv.org/content/10.1101/2022.12.31.522402v1}, author = {Michael J. Lee and James J. DiCarlo} } @article {5244, title = {Feature learning in deep classifiers through Intermediate Neural Collapse}, number = {141}, year = {2023}, month = {02/2023}, abstract = {

In this paper, we conduct an empirical study of the feature learning process in deep classifiers. Recent research has identified a training phenomenon called Neural Collapse (NC), in which the top-layer feature embeddings of samples from the same class tend to concentrate around their means, and the top layer{\textquoteright}s weights align with those features. Our study aims to investigate if these properties extend to intermediate layers. We empirically study the evolution of the covariance and mean of representations across different layers and show that as we move deeper into a trained neural network, the within-class covariance decreases relative to the between-class covariance. Additionally, we find that in the top layers, where the between-class covariance is dominant, the subspace spanned by the class means aligns with the subspace spanned by the most significant singular vector components of the weight matrix in the corresponding layer. Finally, we discuss the relationship between NC and Associative Memories (Willshaw et al., 1969).

}, author = {Akshay Rangamani and Marius Lindegaard and Tomer Galanti and Tomaso Poggio} } @article {5241, title = {For interpolating kernel machines, minimizing the norm of the ERM solution maximizes stability}, journal = {Analysis and Applications}, volume = {21}, year = {2023}, month = {01/2023}, pages = {193 - 215}, abstract = {

In this paper, we study kernel ridge-less regression, including the case of interpolating solutions. We prove that maximizing the leave-one-out (CVloo) stability minimizes the expected error. Further, we also prove that the minimum norm solution {\textemdash} to which gradient algorithms are known to converge {\textemdash} is the most stable solution. More precisely, we show that the minimum norm interpolating solution minimizes a bound on CVloo stability, which in turn is controlled by the smallest singular value, hence the condition number, of the empirical kernel matrix. These quantities can be characterized in the asymptotic regime where both the dimension (d) and cardinality (n) of the data go to infinity (with n/d{\textrightarrow}γ as d,n{\textrightarrow}$\infty$). Our results suggest that the property of CVloo stability of the learning algorithm with respect to perturbations of the training set may provide a more general framework than the classical theory of Empirical Risk Minimization (ERM). While ERM was developed to deal with the classical regime in which the architecture of the learning network is fixed and n{\textrightarrow}$\infty$, the modern regime focuses on interpolating regressors and overparameterized models, when both d and n go to infinity. Since the stability framework is known to be equivalent to the classical theory in the classical regime, our results here suggest that it may be interesting to extend it beyond kernel regression to other overparameterized algorithms such as deep networks.

}, keywords = {Algorithmic stability, high dimensional statistics, kernel regression, minimum norm interpolation, overparameterization}, issn = {0219-5305}, doi = {10.1142/S0219530522400115}, url = {https://www.worldscientific.com/doi/10.1142/S0219530522400115}, author = {Rangamani, Akshay and Rosasco, Lorenzo and Poggio, Tomaso} } @article {5259, title = {Forward learning with top-down feedback: empirical and analytical characterization}, journal = {arXiv}, year = {2023}, month = {02/2023}, abstract = {

{\textquotedblleft}Forward-only{\textquotedblright} algorithms, which train neural networks while avoiding a backward pass, have recently gained attention as a way of solving the biologically unrealistic aspects of backpropagation. Here, we first discuss the similarities between two {\textquotedblleft}forward-only{\textquotedblright} algorithms, the Forward-Forward and PEPITA frameworks, and demonstrate that PEPITA is equivalent to a Forward-Forward framework with top-down feedback connections. Then, we focus on PEPITA to address compelling challenges related to the {\textquotedblleft}forward-only{\textquotedblright} rules, which include providing an analytical understanding of their dynamics and reducing the gap between their performance and that of backpropagation. We propose a theoretical analysis of the dynamics of PEPITA. In particular, we show that PEPITA is well-approximated by an {\textquotedblleft}adaptive-feedback-alignment{\textquotedblright} algorithm and we analytically track its performance during learning in a prototype high-dimensional setting. Finally, we develop a strategy to apply the weight mirroring algorithm on {\textquotedblleft}forward-only{\textquotedblright} algorithms with top-down feedback and we show how it impacts PEPITA{\textquoteright}s accuracy and convergence rate.

}, url = {https://arxiv.org/abs/2302.05440}, author = {Ravi Francesco Srinivasan and Francesca Mignacco and Martino Sorbaro and Maria Refinetti and Gabriel Kreiman and Giorgia Dellaferrera} } @article {5406, title = {A Homogeneous Transformer Architecture}, number = {143}, year = {2023}, month = {09/2023}, abstract = {

While the Transformer architecture has made a substantial impact in the field of machine learning, it is unclear what purpose each component serves in the overall architecture. Heterogeneous nonlinear circuits such as multi-layer RELU networks are interleaved with layers of soft-max units. We introduce here a homogeneous architecture based on Hyper Radial Basis Function (HyperBF) units. Evaluations on CIFAR10, CIFAR100, and Tiny ImageNet demonstrate a performance comparable to standard vision transformers.

}, author = {Yulu Gan and Tomaso A. Poggio} } @article {5290, title = {Implicit regularization with strongly convex bias: Stability and acceleration}, journal = {Analysis and Applications}, volume = {21}, year = {2023}, month = {01/2023}, pages = {165 - 191}, abstract = {

Implicit regularization refers to the property of optimization algorithms to be biased towards a certain class of solutions. This property is relevant to understand the behavior of modern machine learning algorithms as well as to design efficient computational methods. While the case where the bias is given by a Euclidean norm is well understood, implicit regularization schemes for more general classes of biases are much less studied. In this work, we consider the case where the bias is given by a strongly convex functional, in the context of linear models, and data possibly corrupted by noise. In particular, we propose and analyze accelerated optimization methods and highlight a trade-off between convergence speed and stability. Theoretical findings are complemented by an empirical analysis on high-dimensional inverse problems in machine learning and signal processing, showing excellent results compared to the state of the art.

}, issn = {0219-5305}, doi = {10.1142/S0219530522400139}, url = {https://www.worldscientific.com/doi/10.1142/S0219530522400139}, author = {Villa, Silvia and Matet, Simon and Vu, Bằng C{\^o}ng and Rosasco, Lorenzo} } @article {5293, title = {Infants and toddlers leverage their understanding of action goals to evaluate agents who help others}, journal = {Child Development}, year = {2023}, month = {02/2023}, abstract = {

Why do infants and toddlers prefer helpers? Four experiments (conducted from 2019{\textendash}2022; n = 136, 66\% White, 15\% Asian, 4\% Black, 2\% Hispanic/Latino, 13\% multiracial, majority USA) investigated whether infants and toddlers favor agents whose actions allow others to achieve their goals. In the key experiment, 8-month-old infants and 15-month-old toddlers viewed a protagonist who tried and failed to open a box that contained a toy while two other agents (helpers) observed; then the toys were exchanged and the helpers opened different boxes. Infants and toddlers differently evaluated the two helpers, consistent with their developing means-end understanding. Together, the present four experiments connect infants{\textquoteright} and toddlers{\textquoteright} evaluations of helping to their understanding of goal-directed behavior.

}, issn = {0009-3920}, doi = {10.1111/cdev.13895}, url = {https://onlinelibrary.wiley.com/doi/10.1111/cdev.13895}, author = {Woo, Brandon M. and Spelke, Elizabeth S.} } @article {5466, title = {The Janus effects of SGD vs GD: high noise and low rank}, number = {144}, year = {2023}, month = {12/2023}, abstract = {

It was always obvious that SGD with small minibatch size yields much higher asymptotic fluctuations in the updates of the weight matrices of neural networks than GD does. It has also often been reported that SGD in deep RELU networks empirically shows a low-rank bias in the weight matrices. A recent theoretical analysis derived a bound on the rank and linked it to the size of the SGD fluctuations [25]. In this paper, we provide an empirical and theoretical analysis of the convergence of SGD vs GD, first for deep RELU networks and then for the case of linear regression, where sharper estimates can be obtained and which is of independent interest. In the linear case, we prove that the component $W^\perp$ of the matrix $W$ corresponding to the null space of the data matrix $X$ converges to zero for both SGD and GD, provided the regularization term is non-zero. Because of the larger number of updates required to go through all the training data, the convergence rate {\it per epoch} of these components is much faster for SGD than for GD. In practice, SGD has a much stronger bias than GD towards solutions for weight matrices $W$ with high fluctuations {\textendash} even when the choice of minibatches is deterministic {\textendash} and low rank, provided the initialization is from a random matrix. Thus SGD with non-zero regularization shows the coupled phenomenon of asymptotic noise and a low-rank bias, unlike GD.

}, author = {Mengjia Xu and Tomer Galanti and Akshay Rangamani and Lorenzo Rosasco and Andrea Pinto and Tomaso Poggio} } @article {5460, title = {Many but not all deep neural network audio models capture brain responses and exhibit correspondence between model stages and brain regions}, journal = {PLOS Biology}, volume = {21}, year = {2023}, month = {12/2023}, pages = {e3002366}, abstract = {

Models that predict brain responses to stimuli provide one measure of understanding of a sensory system and have many potential applications in science and engineering. Deep artificial neural networks have emerged as the leading such predictive models of the visual system but are less explored in audition. Prior work provided examples of audio-trained neural networks that produced good predictions of auditory cortical fMRI responses and exhibited correspondence between model stages and brain regions, but left it unclear whether these results generalize to other neural network models and, thus, how to further improve models in this domain. We evaluated model-brain correspondence for publicly available audio neural network models along with in-house models trained on 4 different tasks. Most tested models outpredicted standard spectrotemporal filter-bank models of auditory cortex and exhibited systematic model-brain correspondence: Middle stages best predicted primary auditory cortex, while deep stages best predicted non-primary cortex. However, some state-of-the-art models produced substantially worse brain predictions. Models trained to recognize speech in background noise produced better brain predictions than models trained to recognize speech in quiet, potentially because hearing in noise imposes constraints on biological auditory representations. The training task influenced the prediction quality for specific cortical tuning properties, with best overall predictions resulting from models trained on multiple tasks. The results generally support the promise of deep neural networks as models of audition, though they also indicate that current models do not explain auditory cortical responses in their entirety.

}, doi = {10.1371/journal.pbio.3002366}, url = {https://dx.plos.org/10.1371/journal.pbio.3002366}, author = {Tuckute, Greta and Feather, Jenelle and Boebinger, Dana and McDermott, Josh H.}, editor = {Poeppel, David} } @article {5435, title = {Minute-scale periodicity of neuronal firing in the human entorhinal cortex}, journal = {Cell Reports}, volume = {42}, year = {2023}, month = {11/2023}, pages = {113271}, abstract = {

Grid cells in the entorhinal cortex demonstrate spatially periodic firing, thought to provide a spatial map on behaviorally relevant length scales. Whether such periodicity exists for behaviorally relevant time scales in the human brain remains unclear. We investigate neuronal firing during a temporally continuous experience by presenting 14 neurosurgical patients with a video while recording neuronal activity from multiple brain regions. We report on neurons that modulate their activity in a periodic manner across different time scales{\textemdash}from seconds to many minutes, most prevalently in the entorhinal cortex. These neurons remap their dominant periodicity to shorter time scales during a subsequent recognition memory task. When the video is presented at two different speeds, a significant percentage of these temporally periodic cells (TPCs) maintain their time scales, suggesting a degree of invariance. The TPCs{\textquoteright} temporal periodicity might complement the spatial periodicity of grid cells and together provide scalable spatiotemporal metrics for human experience.

}, issn = {22111247}, doi = {10.1016/j.celrep.2023.113271}, url = {https://linkinghub.elsevier.com/retrieve/pii/S2211124723012834}, author = {M. Aghajan, Zahra and Kreiman, Gabriel and Fried, Itzhak} } @article {5428, title = {Model metamers reveal divergent invariances between biological and artificial neural networks}, journal = {Nature Neuroscience}, year = {2023}, month = {10/2023}, abstract = {

Deep neural network models of sensory systems are often proposed to learn representational transformations with invariances like those in the brain. To reveal these invariances, we generated {\textquoteleft}model metamers{\textquoteright}, stimuli whose activations within a model stage are matched to those of a natural stimulus. Metamers for state-of-the-art supervised and unsupervised neural network models of vision and audition were often completely unrecognizable to humans when generated from late model stages, suggesting differences between model and human invariances. Targeted model changes improved human recognizability of model metamers but did not eliminate the overall human{\textendash}model discrepancy. The human recognizability of a model{\textquoteright}s metamers was well predicted by their recognizability by other models, suggesting that models contain idiosyncratic invariances in addition to those required by the task. Metamer recognizability dissociated from both traditional brain-based benchmarks and adversarial vulnerability, revealing a distinct failure mode of existing sensory models and providing a complementary benchmark for model assessment.

}, issn = {1097-6256}, doi = {10.1038/s41593-023-01442-0}, url = {https://www.nature.com/articles/s41593-023-01442-0}, author = {Feather, Jenelle and Leclerc, Guillaume and M{\k a}dry, Aleksander and McDermott, Josh H.} } @article {5319, title = {The neural architecture of theory-based reinforcement learning}, journal = {Neuron}, volume = {111}, year = {2023}, month = {03/2023}, pages = {1331 - 1344.e8}, abstract = {

Humans learn internal models of the world that support planning and generalization in complex environments. Yet it remains unclear how such internal models are represented and learned in the brain. We approach this question using theory-based reinforcement learning, a strong form of model-based reinforcement learning in which the model is a kind of intuitive theory. We analyzed fMRI data from human participants learning to play Atari-style games. We found evidence of theory representations in prefrontal cortex and of theory updating in prefrontal cortex, occipital cortex, and fusiform gyrus. Theory updates coincided with transient strengthening of theory representations. Effective connectivity during theory updating suggests that information flows from prefrontal theory-coding regions to posterior theory-updating regions. Together, our results are consistent with a neural architecture in which top-down theory representations originating in prefrontal regions shape sensory predictions in visual areas, where factored theory prediction errors are computed and trigger bottom-up updates of the theory.

}, issn = {08966273}, doi = {10.1016/j.neuron.2023.01.023}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627323000739}, author = {Tomov, Momchil S. and Tsividis, Pedro A. and Pouncy, Thomas and Tenenbaum, Joshua B. and Gershman, Samuel J.} } @article {5314, title = {Neural coding: Stimulating cortex to alter visual perception}, journal = {Current Biology}, volume = {33}, year = {2023}, month = {02/2023}, pages = {R117 - R118}, abstract = {

A new study has shown that monkeys detect transient external pulses delivered to the highest echelons of visual cortex in a way that depends on concomitant visual inputs. This new work, a technical tour de force, has implications for the development of future visual prosthetic devices.

}, issn = {09609822}, doi = {10.1016/j.cub.2022.12.047}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0960982222019844}, author = {Kreiman, Gabriel} } @article {5368, title = {Non-commitment in mental imagery}, journal = {Cognition}, volume = {238}, year = {2023}, month = {09/2023}, pages = {105498}, abstract = {

We examine non-commitment in the imagination. Across 5 studies (N \> 1,800), we find that most people are non-committal about basic aspects of their mental images, including features that would be readily apparent in real images. While previous work on the imagination has discussed the possibility of non-commitment, this paper is the first, to our knowledge, to examine this systematically and empirically. We find that people do not commit to basic properties of specified mental scenes (Studies 1 and 2), and that people report non-commitment rather than uncertainty or forgetfulness (Study 3). Such non-commitment is present even for people with generally vivid imaginations, and those who report imagining the specified scene very vividly (Studies 4a, 4b). People readily confabulate properties of their mental images when non-commitment is not offered as an explicit option (Study 5). Taken together, these results establish non-commitment as a pervasive component of mental imagery.

}, issn = {00100277}, doi = {10.1016/j.cognition.2023.105498}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027723001324}, author = {Bigelow, Eric J. and McCoy, John P. and Ullman, Tomer D.} } @article {5316, title = {NOPA: Neurally-guided Online Probabilistic Assistance for Building Socially Intelligent Home Assistants}, journal = {arXiv}, year = {2023}, month = {01/2023}, abstract = {

In this work, we study how to build socially intelligent robots to assist people in their homes. In particular, we focus on assistance with online goal inference, where robots must simultaneously infer humans{\textquoteright} goals and how to help them achieve those goals. Prior assistance methods either lack the adaptivity to adjust helping strategies (i.e., when and how to help) in response to uncertainty about goals or the scalability to conduct fast inference in a large goal space. Our NOPA (Neurally-guided Online Probabilistic Assistance) method addresses both of these challenges. NOPA consists of (1) an online goal inference module combining neural goal proposals with inverse planning and particle filtering for robust inference under uncertainty, and (2) a helping planner that discovers valuable subgoals to help with and is aware of the uncertainty in goal inference. We compare NOPA against multiple baselines in a new embodied AI assistance challenge: Online Watch-And-Help, in which a helper agent needs to simultaneously watch a main agent{\textquoteright}s action, infer its goal, and help perform a common household task faster in realistic virtual home environments. Experiments show that our helper agent robustly updates its goal inference and adapts its helping plans to the changing level of uncertainty.

}, url = {https://arxiv.org/abs/2301.05223}, author = {Xavier Puig and Tianmin Shu and Joshua B. Tenenbaum and Torralba, Antonio} } @article {5242, title = {Norm-Based Generalization Bounds for Compositionally Sparse Neural Networks}, year = {2023}, abstract = {

In this paper, we investigate the Rademacher complexity of deep sparse neural networks, where each neuron receives a small number of inputs. We prove generalization bounds for multilayered sparse ReLU neural networks, including convolutional neural networks. These bounds differ from previous ones, as they consider the norms of the convolutional filters instead of the norms of the associated Toeplitz matrices, independently of weight sharing between neurons.

As we show theoretically, these bounds may be orders of magnitude better than standard norm-based generalization bounds and empirically, they are almost non-vacuous in estimating generalization in various simple classification problems. Taken together, these results suggest that compositional sparsity of the underlying target function is critical to the success of deep neural networks.

}, author = {Tomer Galanti and Mengjia Xu and Liane Galanti and Tomaso Poggio} } @conference {5496, title = {Norm-based Generalization Bounds for Sparse Neural Networks}, booktitle = {NeurIPS 2023}, year = {2023}, month = {12/2023}, address = {New Orleans}, abstract = {

In this paper, we derive norm-based generalization bounds for sparse ReLU neural networks, including convolutional neural networks. These bounds differ from previous ones because they consider the sparse structure of the neural network architecture and the norms of the convolutional filters, rather than the norms of the (Toeplitz) matrices associated with the convolutional layers. Theoretically, we demonstrate that these bounds are significantly tighter than standard norm-based generalization bounds. Empirically, they offer relatively tight estimations of generalization for various simple classification problems. Collectively, these findings suggest that the sparsity of the underlying target function and the model{\textquoteright}s architecture plays a crucial role in the success of deep learning.

}, url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/8493e190ff1bbe3837eca821190b61ff-Paper-Conference.pdf}, author = {Tomer Galanti and Mengjia Xu and Liane Galanti and Tomaso Poggio} } @article {5321, title = {Out of sight, out of mind: Responses in primate ventral visual cortex track individual fixations during natural vision}, journal = {bioRxiv}, year = {2023}, month = {02/2023}, abstract = {

During natural vision, primates shift their gaze several times per second with large, ballistic eye movements known as saccades. Open questions remain as to whether visual neurons retain their classical retinotopic response properties during natural vision or whether neurons integrate information across fixations and predict the consequences of impending saccades. Answers are especially wanting for vision in complex scenes relevant to natural behavior. We let 13 monkeys freely view thousands of large natural images, recorded over 883 hours of neuronal responses throughout the ventral visual pathway across 4.7 million fixations, and designed flexible analyses to reveal the spatial, temporal, and feature selectivity of the responses. Ventral visual responses followed each fixation and did not become gaze-invariant as monkeys examined an image over seconds. Computational models revealed that neuronal responses corresponded to eye-centered receptive fields. The results suggest that ventral visual cortex remains predominantly retinotopic during natural vision and does not establish a gaze-independent representation of the world.

}, doi = {10.1101/2023.02.08.527666 }, url = {https://www.biorxiv.org/content/10.1101/2023.02.08.527666v1}, author = {Will Xiao and Saloni Sharma and Gabriel Kreiman and Margaret S. Livingstone} } @article {5448, title = {Perception of 3D shape integrates intuitive physics and analysis-by-synthesis}, journal = {Nature Human Behaviour}, year = {2023}, month = {11/2023}, abstract = {

Many surface cues support three-dimensional shape perception, but humans can sometimes still see shape when these features are missing{\textemdash}such as when an object is covered with a draped cloth. Here we propose a framework for three-dimensional shape perception that explains perception in both typical and atypical cases as analysis-by-synthesis, or inference in a generative model of image formation. The model integrates intuitive physics to explain how shape can be inferred from the deformations it causes to other objects, as in cloth draping. Behavioural and computational studies comparing this account with several alternatives show that it best matches human observers (total n = 174) in both accuracy and response times, and is the only model that correlates significantly with human performance on difficult discriminations. We suggest that bottom-up deep neural network models are not fully adequate accounts of human shape perception, and point to how machine vision systems might achieve more human-like robustness.

}, doi = {10.1038/s41562-023-01759-7}, url = {https://www.nature.com/articles/s41562-023-01759-7}, author = {Yildirim, Ilker and Siegel, Max H. and Soltani, Amir A. and Ray Chaudhuri, Shraman and Tenenbaum, Joshua B.} } @article {5317, title = {Physics informed machine learning for wind speed prediction}, journal = {Energy}, volume = {268}, year = {2023}, month = {04/2023}, pages = {126628}, abstract = {

The ability to predict wind is crucial for both energy production and weather forecasting. Mechanistic models that form the basis of traditional forecasting perform poorly near the ground. Here we take an alternative data-driven approach based on supervised learning. We analyze massive datasets of wind measured from anemometers located at 10\ m height in 32 locations in central and north-west Italy. We train supervised learning algorithms using the past history of wind to predict its value at future horizons. Using data from single locations and horizons, we compare systematically several algorithms where we vary the input/output variables, the memory and the linear vs non-linear model. We then compare performance of the best algorithms across all locations and forecasting horizons. We find that the optimal design as well as its performance change with the location. We demonstrate that the presence of a diurnal cycle provides a rationale to understand this variation. We conclude with a systematic comparison with state-of-the-art algorithms. When focusing on publicly available datasets, our algorithm improves performance by 0.3\ m/s on average. In the aggregate, these comparisons show that, when the model is accurately designed, shallow algorithms are competitive with deep architectures.

}, issn = {03605442}, doi = {10.1016/j.energy.2023.126628}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0360544223000221}, author = {Lagomarsino-Oneto, Daniele and Meanti, Giacomo and Pagliana, Nicol{\`o} and Verri, Alessandro and Mazzino, Andrea and Rosasco, Lorenzo and Seminara, Agnese} } @article {5268, title = {Preliminary evidence for selective cortical responses to music in one-month-old infants}, journal = {Developmental Science}, year = {2023}, month = {03/2023}, abstract = {

Prior studies have observed selective neural responses in the adult human auditory cortex to music and speech that cannot be explained by the differing lower-level acoustic properties of these stimuli. Does infant cortex exhibit similarly selective responses to music and speech shortly after birth? To answer this question, we attempted to collect functional magnetic resonance imaging (fMRI) data from 45 sleeping infants (2.0- to 11.9-weeks-old) while they listened to monophonic instrumental lullabies and infant-directed speech produced by a mother. To match acoustic variation between music and speech sounds we (1) recorded music from instruments that had a similar spectral range as female infant-directed speech, (2) used a novel excitation-matching algorithm to match the cochleagrams of music and speech stimuli, and (3) synthesized {\textquotedblleft}model-matched{\textquotedblright} stimuli that were matched in spectrotemporal modulation statistics to (yet perceptually distinct from) music or speech. Of the 36 infants we collected usable data from, 19 had significant activations to sounds overall compared to scanner noise. From these infants, we observed a set of voxels in non-primary auditory cortex (NPAC) but not in Heschl{\textquoteright}s Gyrus that responded significantly more to music than to each of the other three stimulus types (but not significantly more strongly than to the background scanner noise). In contrast, our planned analyses did not reveal voxels in NPAC that responded more to speech than to model-matched speech, although other unplanned analyses did. These preliminary findings suggest that music selectivity arises within the first month of life.

}, keywords = {auditory cortex, fMRI, infants, music, speech}, issn = {1363-755X}, doi = {10.1111/desc.13387}, url = {https://onlinelibrary.wiley.com/doi/10.1111/desc.13387}, author = {Heather L Kosakowski and Norman-Haignere, Samuel and Mynick, Anna and Takahashi, Atsushi and Saxe, Rebecca and Nancy Kanwisher} } @article {5243, title = {SGD and Weight Decay Provably Induce a Low-Rank Bias in Deep Neural Networks}, year = {2023}, abstract = {

In this paper, we study the bias of Stochastic Gradient Descent (SGD) to learn low-rank weight matrices when training deep ReLU neural networks. Our results show that training neural networks with mini-batch SGD and weight decay causes a bias towards rank minimization over the weight matrices. Specifically, we show, both theoretically and empirically, that this bias is more pronounced when using smaller batch sizes, higher learning rates, or increased weight decay. Additionally, we predict and observe empirically that weight decay is necessary to achieve this bias. Finally, we empirically investigate the connection between this bias and generalization, finding that it has a marginal effect on generalization. Our analysis is based on a minimal set of assumptions and applies to neural networks of any width or depth, including those with residual connections and convolutional layers.

}, author = {Tomer Galanti and Zachary Siegel and Aparna Gupte and Tomaso Poggio} } @article {5360, title = {Skip Connections Increase the Capacity of Associative Memories in Variable Binding Mechanisms}, number = {142}, year = {2023}, month = {06/2023}, abstract = {

The flexibility of intelligent behavior is fundamentally attributed to the ability to separate and assign structural information from content in sensory inputs. Variable binding is the atomic computation that underlies this ability. In this work, we investigate the implementation of variable binding via pointers of assemblies of neurons, which are sets of excitatory neurons that fire together. The Assembly Calculus is a framework that describes a set of operations to create and modify assemblies of neurons. We focus on the project (which creates assemblies) and reciprocal-project (which performs variable binding) operations and study the capacity of networks in terms of the number of assemblies that can be reliably created and retrieved. We find that assembly calculus networks implemented through Hebbian plasticity resemble associative memories in their structure and behavior. However, for networks with N neurons per brain area, the capacity of variable binding networks (0.01N) is an order of magnitude lower than the capacity of assembly creation networks (0.22N). To alleviate this drop in capacity, we propose a skip connection between the input and variable assembly, which boosts the capacity to a similar order of magnitude (0.1N) as the Project operation, while maintaining its biological plausibility.

}, author = {Yi Xie and Yichen Li and Akshay Rangamani} } @article {5254, title = {Sparse distributed memory is a continual learner}, year = {2023}, month = {03/2023}, address = {Kigali, Rwanda, Africa}, abstract = {

Continual learning is a problem for artificial neural networks that their biological counterparts are adept at solving. Building on work using Sparse Distributed Memory (SDM) to connect a core neural circuit with the powerful Transformer model, we create a modified Multi-Layered Perceptron (MLP) that is a strong continual learner. We find that every component of our MLP variant translated from biology is necessary for continual learning. Our solution is also free from any memory replay or task information, and introduces novel methods to train sparse networks that may be broadly applicable.
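One ingredient named in this entry's keywords is a Top-K activation. The snippet below is a minimal, stand-alone sketch of that mechanism only, not the authors' full SDM-derived MLP: a layer that keeps the k most active hidden units and zeroes the rest.

```python
# Illustrative Top-K activation layer: only the k most active hidden units fire.
# This sketches one sparsity mechanism in isolation, not the full SDM-derived model.
import torch
import torch.nn as nn

class TopK(nn.Module):
    def __init__(self, k: int):
        super().__init__()
        self.k = k

    def forward(self, x):
        # Keep the k largest activations per sample, zero out the rest.
        _, idx = torch.topk(x, self.k, dim=-1)
        mask = torch.zeros_like(x).scatter_(-1, idx, 1.0)
        return x * mask

mlp = nn.Sequential(nn.Linear(784, 1024), TopK(k=64), nn.Linear(1024, 10))
out = mlp(torch.randn(8, 784))
print(out.shape)  # torch.Size([8, 10])
```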

}, keywords = {Biologically Inspired, Continual Learning, Sparse Distributed Memory, Sparsity, Top-K Activation}, url = {https://openreview.net/forum?id=JknGeelZJpHP}, author = {Trenton Bricken and Xander Davies and Deepak Singh and Dmitry Krotov and Gabriel Kreiman} } @proceedings {5399, title = {System Identification of Neural Systems: If We Got It Right, Would We Know?}, volume = {202}, year = {2023}, month = {07/2023}, pages = {12430-12444}, abstract = {

Artificial neural networks are being proposed as models of parts of the brain. The networks are compared to recordings of biological neurons, and good performance in reproducing neural responses is considered to support the model{\textquoteright}s validity. A key question is how much this system identification approach tells us about brain computation. Does it validate one model architecture over another? We evaluate whether the most commonly used comparison techniques, such as a linear encoding model and centered kernel alignment, can correctly identify a model, by replacing brain recordings with known ground-truth models. System identification performance is quite variable; it also depends significantly on factors independent of the ground-truth architecture, such as the stimulus images. In addition, we show the limitations of using functional similarity scores in identifying higher-level architectural motifs.
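One of the comparison measures mentioned above, centered kernel alignment, can be stated compactly. The following is a minimal linear-CKA implementation applied to synthetic response matrices; the data and shapes are placeholders, not the paper's stimuli or models.

```python
# Minimal linear CKA (centered kernel alignment) between two response matrices,
# one of the model-to-brain comparison measures evaluated in the paper.
# X, Y: (n_stimuli, n_units) arrays; rows are responses to the same stimuli.
import numpy as np

def linear_cka(X, Y):
    X = X - X.mean(axis=0, keepdims=True)   # center each unit
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(Y.T @ X, 'fro') ** 2
    norm_x = np.linalg.norm(X.T @ X, 'fro')
    norm_y = np.linalg.norm(Y.T @ Y, 'fro')
    return hsic / (norm_x * norm_y)

rng = np.random.default_rng(0)
A = rng.standard_normal((200, 300))          # e.g., one layer's responses to 200 stimuli
B = A @ rng.standard_normal((300, 50)) + 0.1 * rng.standard_normal((200, 50))
print(f"CKA(A, B)     = {linear_cka(A, B):.3f}")   # high: B is close to a linear map of A
print(f"CKA(A, noise) = {linear_cka(A, rng.standard_normal((200, 50))):.3f}")
```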

}, url = {https://proceedings.mlr.press/v202/han23d.html}, author = {Yena Han and Tomaso A. Poggio and Brian Cheung} } @article {5320, title = {Using artificial neural networks to ask {\textquoteleft}why{\textquoteright} questions of minds and brains}, journal = {Trends in Neurosciences}, volume = {46}, year = {2023}, month = {03/2023}, pages = {240 - 254}, abstract = {

Neuroscientists have long characterized the properties and functions of the nervous system, and are increasingly succeeding in answering how brains perform the tasks they do. But the question {\textquoteleft}why{\textquoteright} brains work the way they do is asked less often. The new ability to optimize artificial neural networks (ANNs) for performance on human-like tasks now enables us to approach these {\textquoteleft}why{\textquoteright} questions by asking when the properties of networks optimized for a given task mirror the behavioral and neural characteristics of humans performing the same task. Here we highlight the recent success of this strategy in explaining why the visual and auditory systems work the way they do, at both behavioral and neural levels.

}, issn = {01662236}, doi = {10.1016/j.tins.2022.12.008}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0166223622002624}, author = {Kanwisher, Nancy and Khosla, Meenakshi and Dobs, Katharina} } @conference {5322, title = {Zero-shot linear combinations of grounded social interactions with Linear Social MDPs}, booktitle = {Proceedings of the 37th AAAI Conference on Artificial Intelligence (AAAI)}, year = {2023}, month = {02/2023}, abstract = {

Humans and animals engage in rich social interactions. It is often theorized that a relatively small number of basic social interactions give rise to the full range of behavior observed. But no computational theory explaining how social interactions combine has been proposed before. We do so here. We take a model, the Social MDP, which is able to express a range of social interactions, and extend it to represent linear combinations of social interactions. Practically, for robotics applications, such models can now express not just that an agent should help another agent, but goal-centric social interactions. Perhaps an agent is helping someone get dressed, but preventing them from falling, and is happy to exchange stories in the meantime. How an agent responds socially should depend on what it thinks the other agent is doing at that point in time. To encode this notion, we take linear combinations of social interactions as defined in Social MDPs, and compute the weights on those combinations on the fly depending on the estimated goals of other agents. This new model, the Linear Social MDP, enables zero-shot reasoning about complex social interactions, provides a mathematical basis for the long-standing intuition that social interactions should compose, and leads to interesting new behaviors that we validate using human observers. Complex social interactions are part of the future of intelligent agents, and having principled mathematical models built on a foundation like MDPs will make it possible to bring social interactions to every robotic application.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @article {5283, title = {Adversarially trained neural representations may already be as robust as corresponding biological neural representations}, journal = {arXiv}, year = {2022}, month = {06/2022}, abstract = {

Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks.

}, author = {Chong Guo and Michael J. Lee and Guillaume Leclerc and Joel Dapello and Yug Rao and Aleksander Madry and James J. DiCarlo} } @conference {5302, title = {The Aligned Multimodal Movie Treebank: An audio, video, dependency-parse treebank}, booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing}, year = {2022}, abstract = {

Treebanks have traditionally included only text and were derived from written sources such as newspapers or the web. We introduce the Aligned Multimodal Movie Treebank (AMMT), an English-language treebank derived from dialog in Hollywood movies, which includes transcriptions of the audio-visual streams with word-level alignment, as well as part-of-speech tags and dependency parses in the Universal Dependencies formalism. AMMT consists of 31,264 sentences and 218,090 words, making it the 3rd largest UD English treebank and the only multimodal treebank in UD. To help with the web-based annotation effort, we also introduce the Efficient Audio Alignment Annotator (EAAA), a companion tool that enables annotators to significantly speed up their annotation process.

}, author = {Adam Yaari and Jan DeWitt and Henry Hu and Bennett Stankovits and Sue Felshin and Yevgeni Berzak and Helena Aparicio and Boris Katz and Ignacio Cases and Andrei Barbu} } @article {5284, title = {Aligning Model and Macaque Inferior Temporal Cortex Representations Improves Model-to-Human Behavioral Alignment and Adversarial Robustness}, journal = {bioRxiv}, year = {2022}, month = {07/2022}, abstract = {

While some state-of-the-art artificial neural network systems in computer vision are strikingly accurate models of the corresponding primate visual processing, there are still many discrepancies between these models and the behavior of primates on object recognition tasks. Many current models suffer from extreme sensitivity to adversarial attacks and often do not align well with the image-by-image behavioral error patterns observed in humans. Previous research has provided strong evidence that primate object recognition behavior can be very accurately predicted by neural population activity in the inferior temporal (IT) cortex, a brain area in the late stages of the visual processing hierarchy. Therefore, here we directly test whether making the late stage representations of models more similar to that of macaque IT produces new models that exhibit more robust, primate-like behavior. We conducted chronic, large-scale multi-electrode recordings across the IT cortex in six non-human primates (rhesus macaques). We then use these data to fine-tune (end-to-end) the model {\textquotedblleft}IT{\textquotedblright} representations such that they are more aligned with the biological IT representations, while preserving accuracy on object recognition tasks. We generate a cohort of models with a range of IT similarity scores validated on held-out animals across two image sets with distinct statistics. Across a battery of optimization conditions, we observed a strong correlation between the models{\textquoteright} IT-likeness and alignment with human behavior, as well as an increase in its adversarial robustness. We further assessed the limitations of this approach and find that the improvements in behavioral alignment and adversarial robustness generalize across different image statistics, but not to object categories outside of those covered in our IT training set. Taken together, our results demonstrate that building models that are more aligned with the primate brain leads to more robust and human-like behavior, and call for larger neural data-sets to further augment these gains.

}, author = {Joel Dapello and Kohitij Kar and Martin Schrimpf and Robert Geary and Michael Ferguson and David D. Cox and James J. DiCarlo} } @article {5134, title = {Animal-to-Animal Variability in Partial Hippocampal Remapping in Repeated Environments}, journal = {The Journal of Neuroscience}, volume = {42}, year = {2022}, month = {06/2022}, pages = {5268 - 5280}, abstract = {

Hippocampal place cells form a map of the environment of an animal. Changes in the hippocampal map can be brought about in a number of ways, including changes to the environment, task, internal state of the subject, and the passage of time. These changes in the hippocampal map have been called remapping. In this study, we examine remapping during repeated exposure to the same environment. Different animals can have different remapping responses to the same changes. This variability across animals in remapping behavior is not well understood. In this work, we analyzed electrophysiological recordings from the CA3 region of the hippocampus performed by Alme et al. (2014), in which five male rats were exposed to 11 different environments, including a variety of repetitions of those environments. To compare the hippocampal maps between two experiences, we computed average rate map correlation coefficients. We found changes in the hippocampal maps between different sessions in the same environment. These changes consisted of partial remapping, a form of remapping in which some place cells maintain their place fields, whereas other place cells remap their place fields. Each animal exhibited partial remapping differently. We discovered that the heterogeneity in hippocampal representational changes across animals is structured; individual animals had consistently different levels of partial remapping across a range of independent comparisons. Our findings highlight that partial hippocampal remapping between repeated environments depends on animal-specific factors.
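The comparison described above (average rate-map correlation between two sessions) can be sketched as follows; the rate maps here are random stand-ins, and the binning and inclusion criteria are assumptions rather than the paper's exact analysis choices.

```python
# Sketch: correlate each cell's spatial rate map across two sessions of the same
# environment and average over cells. Rate maps below are synthetic stand-ins for
# binned firing-rate maps.
import numpy as np
from scipy.stats import pearsonr

def average_ratemap_correlation(maps_a, maps_b):
    """maps_a, maps_b: (n_cells, n_bins_x, n_bins_y) firing-rate maps."""
    rs = []
    for a, b in zip(maps_a, maps_b):
        valid = np.isfinite(a) & np.isfinite(b)   # ignore unvisited (NaN) bins
        if valid.sum() > 2:
            r, _ = pearsonr(a[valid].ravel(), b[valid].ravel())
            rs.append(r)
    return float(np.mean(rs))

rng = np.random.default_rng(1)
session1 = rng.random((40, 20, 20))
session2 = 0.7 * session1 + 0.3 * rng.random((40, 20, 20))   # partially remapped toy data
print(f"average rate-map correlation: {average_ratemap_correlation(session1, session2):.2f}")
```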

}, keywords = {context, hippocampus, interindividual variability, overdispersion, place cell, remapping}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.3221-20.2022}, url = {https://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.3221-20.2022}, author = {Nilchian, Parsa and Matthew A. Wilson and Honi Sanders} } @article {5084, title = {An approximate representation of objects underlies physical reasoning}, journal = {psyArXiv}, year = {2022}, month = {03/2022}, abstract = {

People make fast and reasonable predictions about the physical behavior of everyday objects. To do so, people may be using principled approximations, similar to models developed by engineers for the purposes of real-time physical simulations. We hypothesize that people use simplified object approximations for tracking and action (the "body" representation), as opposed to fine-grained forms for recognition (the "shape" representation). We used three classic psychophysical tasks (causality perception, collision detection, and change detection) in novel settings that dissociate body and shape. People{\textquoteright}s behavior across tasks indicates that they rely on approximate bodies for physical reasoning, and that this approximation lies between convex hulls and fine-grained shapes.

}, url = {https://psyarxiv.com/vebu5/}, author = {Yichen Li and YingQiao Wang and Tal Boger and Kevin A Smith and Samuel J Gershman and Tomer Ullman} } @article {5285, title = {Artificial intelligence insights into hippocampal processing}, journal = {Frontiers in Computational Neuroscience}, volume = {16}, year = {2022}, month = {11/2022}, abstract = {

Advances in artificial intelligence, machine learning, and deep neural networks have led to new discoveries in human and animal learning and intelligence. A recent artificial intelligence agent in the DeepMind family, muZero, can complete a variety of tasks with limited information about the world in which it is operating and with high uncertainty about features of current and future space. To perform, muZero uses only three functions that are general yet specific enough to allow learning across a variety of tasks without overgeneralization across different contexts. Similarly, humans and animals are able to learn and improve in complex environments while transferring learning from other contexts and without overgeneralizing. In particular, the mammalian extrahippocampal system (eHPCS) can guide spatial decision making while simultaneously encoding and processing spatial and contextual information. Like muZero, the eHPCS is also able to adjust contextual representations depending on the degree and significance of environmental changes and environmental cues. In this opinion, we will argue that the muZero functions parallel those of the hippocampal system. We will show that the different components of the muZero model provide a framework for thinking about generalizable learning in the eHPCS, and that the evaluation of how transitions in cell representations occur between similar and distinct contexts can be informed by advances in artificial intelligence agents such as muZero. We additionally explain how advances in AI agents will provide frameworks and predictions by which to investigate the expected link between state changes and neuronal firing. Specifically, we will discuss testable predictions about the eHPCS, including the functions of replay and remapping, informed by the mechanisms behind muZero learning. We conclude with additional ways in which agents such as muZero can aid in illuminating prospective questions about neural functioning, as well as how these agents may shed light on potential expected answers.

}, doi = {10.3389/fncom.2022.1044659}, url = {https://www.frontiersin.org/articles/10.3389/fncom.2022.1044659/full}, author = {Wirtshafter, Hannah S. and Wilson, Matthew A.} } @article {5085, title = {Brain-like functional specialization emerges spontaneously in deep neural networks}, journal = {Science Advances}, volume = {8}, year = {2022}, month = {03/2022}, abstract = {

The human brain contains multiple regions with distinct, often highly specialized functions, from recognizing faces to understanding language to thinking about what others are thinking. However, it remains unclear why the cortex exhibits this high degree of functional specialization in the first place. Here, we consider the case of face perception using artificial neural networks to test the hypothesis that functional segregation of face recognition in the brain reflects a computational optimization for the broader problem of visual recognition of faces and other visual categories. We find that networks trained on object recognition perform poorly on face recognition and vice versa and that networks optimized for both tasks spontaneously segregate themselves into separate systems for faces and objects. We then show functional segregation to varying degrees for other visual categories, revealing a widespread tendency for optimization (without built-in task-specific inductive biases) to lead to functional specialization in machines and, we conjecture, also brains.

}, doi = {10.1126/sciadv.abl8913}, url = {https://www.science.org/doi/10.1126/sciadv.abl8913}, author = {Dobs, Katharina and Julio Martinez-Trujillo and Kell, Alexander J. E. and Nancy Kanwisher} } @article {5129, title = {A computational probe into the behavioral and neural markers of atypical facial emotion processing in autism}, journal = {The Journal of Neuroscience}, year = {2022}, month = {06/2022}, pages = {JN-RM-2229-21}, abstract = {

Despite ample behavioral evidence of atypical facial emotion processing in individuals with autism spectrum disorder (ASD), the neural underpinnings of such behavioral heterogeneities remain unclear. Here, I have used brain-tissue mapped artificial neural network (ANN) models of primate vision to probe candidate neural and behavioral markers of atypical facial emotion recognition in ASD at an image-by-image level. Interestingly, the ANNs{\textquoteright} image-level behavioral patterns better matched the neurotypical subjects{\textquoteright} behavior than those measured in ASD. This behavioral mismatch was most remarkable when the ANN behavior was decoded from units that correspond to the primate inferior temporal (IT) cortex. ANN-IT responses also explained a significant fraction of the image-level behavioral predictivity associated with neural activity in the human amygdala (from epileptic patients without ASD){\textemdash}strongly suggesting that the previously reported encoding of facial emotion intensity in the human amygdala could be primarily driven by projections from the IT cortex. In sum, these results identify primate IT activity as a candidate neural marker and demonstrate how ANN models of vision can be used to generate neural circuit-level hypotheses and guide future human and non-human primate studies in autism.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.2229-21.2022}, url = {https://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.2229-21.2022}, author = {Kar, Kohitij} } @article {5286, title = {Dangerous Ground: One-Year-Old Infants are Sensitive to Peril in Other Agents{\textquoteright} Action Plans}, journal = {Open Mind}, volume = {6}, year = {2022}, month = {10/2022}, pages = {211 - 231}, abstract = {

Do infants appreciate that other people{\textquoteright}s actions may fail, and that these failures endow risky actions with varying degrees of negative utility (i.e., danger)? Three experiments, including a pre-registered replication, addressed this question by presenting 12- to 15-month-old infants (N = 104, 52 female, majority White) with an animated agent who jumped over trenches of varying depth towards its goals. Infants expected the agent to minimize the danger of its actions, and they learned which goal the agent preferred by observing how much danger it risked to reach each goal, even though the agent{\textquoteright}s actions were physically identical and never failed. When we tested younger, 10-month-old infants (N = 102, 52 female, majority White) in a fourth experiment, they did not succeed consistently on the same tasks. These findings provide evidence that one-year-old infants use the height that other agents could fall from in order to explain and predict those agents{\textquoteright} actions.

}, keywords = {action understanding, agency, cognitive development, infancy, open data, open materials, pre-registered}, doi = {10.1162/opmi_a_00063}, url = {https://direct.mit.edu/opmi/article/doi/10.1162/opmi_a_00063/113342/Dangerous-Ground-One-Year-Old-Infants-are}, author = {Liu, Shari and Pepe, Bill and Ganesh Kumar, Manasa and Ullman, Tomer D. and Tenenbaum, Joshua B. and Spelke, Elizabeth S.} } @article {5014, title = {Deep neural network models of sound localization reveal how perception is adapted to real-world environments}, journal = {Nature Human Behavior}, volume = {6}, year = {2022}, month = {01/2022}, pages = {111{\textendash}133 }, chapter = {111}, abstract = {

Mammals localize sounds using information from their two ears. Localization in real-world conditions is challenging, as echoes provide erroneous information and noises mask parts of target sounds. To better understand real-world localization, we equipped a deep neural network with human ears and trained it to localize sounds in a virtual environment. The resulting model localized accurately in realistic conditions with noise and reverberation. In simulated experiments, the model exhibited many features of human spatial hearing: sensitivity to monaural spectral cues and interaural time and level differences, integration across frequency, biases for sound onsets and limits on localization of concurrent sources. But when trained in unnatural environments without reverberation, noise or natural sounds, these performance characteristics deviated from those of humans. The results show how biological hearing is adapted to the challenges of real-world environments and illustrate how artificial neural networks can reveal the real-world constraints that shape perception.

}, doi = {https://doi.org/10.1038/s41562-021-01244-z}, url = {https://www.nature.com/articles/s41562-021-01244-z}, author = {Andrew Francl and Josh H. McDermott} } @article {5033, title = {Do computational models of vision need shape-based representations? Evidence from an individual with intriguing visual perceptions}, journal = {Cognitive Neuropsychology}, year = {2022}, month = {02/2022}, pages = {1 - 3}, keywords = {computer vision models, intermediate representations, Ventral visual cortex, visual deficits}, issn = {0264-3294}, doi = {10.1080/02643294.2022.2041588}, url = {https://www.tandfonline.com/doi/full/10.1080/02643294.2022.2041588}, author = {Armendariz, Marcelo and Will Xiao and Vinken, Kasper and Gabriel Kreiman} } @article {5006, title = {Early concepts of intimacy: Young humans use saliva sharing to infer close relationships}, journal = {Science}, volume = {375}, year = {2022}, month = {01/2022}, pages = {311 - 315}, abstract = {

Across human societies, people form {\textquotedblleft}thick{\textquotedblright} relationships characterized by strong attachments, obligations, and mutual responsiveness. People in thick relationships share food utensils, kiss, or engage in other distinctive interactions that involve sharing saliva. We found that children, toddlers, and infants infer that dyads who share saliva (as opposed to other positive social interactions) have a distinct relationship. Children expect saliva sharing to happen in nuclear families. Toddlers and infants expect that people who share saliva will respond to one another in distress. Parents confirm that saliva sharing is a valid cue of relationship thickness in their children{\textquoteright}s social environments. The ability to use distinctive interactions to infer categories of relationships thus emerges early in life, without explicit teaching; this enables young humans to rapidly identify close relationships, both within and beyond families.

}, issn = {0036-8075}, doi = {10.1126/science.abh1054}, url = {https://www.science.org/doi/10.1126/science.abh1054}, author = {Thomas, Ashley J. and Woo, Brandon and Nettle, Daniel and Elizabeth S Spelke and Rebecca Saxe} } @article {5035, title = {On the Efficacy of Co-Attention Transformer Layers in Visual Question Answering}, journal = {arXiv}, year = {2022}, month = {01/2022}, abstract = {

In recent years, multi-modal transformers have shown significant progress in Vision-Language tasks, such as Visual Question Answering (VQA), outperforming previous architectures by a considerable margin. This improvement in VQA is often attributed to the rich interactions between vision and language streams. In this work, we investigate the efficacy of co-attention transformer layers in helping the network focus on relevant regions while answering the question. We generate visual attention maps using the question-conditioned image attention scores in these co-attention layers. We evaluate the effect of the following critical components on visual attention of a state-of-the-art VQA model: (i) number of object region proposals, (ii) question part of speech (POS) tags, (iii) question semantics, (iv) number of co-attention layers, and (v) answer accuracy. We compare the neural network attention maps against human attention maps both qualitatively and quantitatively. Our findings indicate that co-attention transformer modules are crucial in attending to relevant regions of the image given a question. Importantly, we observe that the semantic meaning of the question is not what drives visual attention, but specific keywords in the question do. Our work sheds light on the function and interpretation of co-attention transformer layers, highlights gaps in current networks, and can guide the development of future VQA models and networks that simultaneously process visual and language streams.

}, doi = {10.48550/arXiv.2201.03965}, url = {https://arxiv.org/abs/2201.03965}, author = {Ankur Sikarwar and Gabriel Kreiman} } @article {5287, title = {Eight-Month-Old Infants{\textquoteright} Social Evaluations of Agents Who Act on False Beliefs}, journal = {Proceedings of the Annual Meeting of the Cognitive Science Society}, volume = {44}, year = {2022}, abstract = {

Do infants{\textquoteright} social evaluations privilege the outcomes of others{\textquoteright} actions, or the beliefs underlying those actions? In two experiments, 8-month-old infants viewed a protagonist who sought to grasp one of two toys, each inside a different box, as two other agents observed. Then, while the protagonist was away, the toys exchanged locations, either in the presence or absence of the two other agents. Thus, the agents had either true or false beliefs about the toys{\textquoteright} locations. When the protagonist returned, one agent opened the box that now contained the protagonist{\textquoteright}s desired toy, whereas the other opened the box that previously contained that toy. When agents had true beliefs about the desired toy{\textquoteright}s location, infants preferred the agent who opened the box containing that toy. When agents had false beliefs about that location, infants instead preferred the agent who opened the opposite box. Thus, infants{\textquoteright} social evaluations privilege agents{\textquoteright} beliefs.

}, url = {https://escholarship.org/uc/item/8k02x1mx}, author = {Woo, Brandon and Elizabeth Spelke} } @proceedings {5140, title = {Error-driven Input Modulation: Solving the Credit Assignment Problem without a Backward Pass}, volume = {162}, year = {2022}, month = {07/2022}, pages = {4937-4955}, abstract = {

Supervised learning in artificial neural networks typically relies on backpropagation, where the weights are updated based on the error-function gradients and sequentially propagated from the output layer to the input layer. Although this approach has proven effective in a wide domain of applications, it lacks biological plausibility in many regards, including the weight symmetry problem, the dependence of learning on non-local signals, the freezing of neural activity during error propagation, and the update locking problem. Alternative training schemes have been introduced, including sign symmetry, feedback alignment, and direct feedback alignment, but they invariably rely on a backward pass that hinders the possibility of solving all the issues simultaneously. Here, we propose to replace the backward pass with a second forward pass in which the input signal is modulated based on the error of the network. We show that this novel learning rule comprehensively addresses all the above-mentioned issues and can be applied to both fully connected and convolutional models. We test this learning rule on MNIST, CIFAR-10, and CIFAR-100. These results help incorporate biological principles into machine learning.
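A minimal sketch of the general idea, under simplifying assumptions and not necessarily the authors' exact update rule: a first forward pass computes the output error, a second forward pass runs on the input modulated by a fixed random projection of that error, and weights are updated from the difference between the two passes' activations.

```python
# Illustrative two-forward-pass learning rule on a one-hidden-layer network (numpy).
# All architecture, task, and learning-rate choices are assumptions for illustration.
import numpy as np

rng = np.random.default_rng(0)
n_in, n_hid, n_out, lr = 20, 64, 2, 0.01

W1 = rng.standard_normal((n_hid, n_in)) * 0.1
W2 = rng.standard_normal((n_out, n_hid)) * 0.1
F = rng.standard_normal((n_in, n_out)) * 0.05      # fixed random error-projection matrix

def relu(z):
    return np.maximum(z, 0.0)

# Toy task: classify by the sign of the first input dimension.
X = rng.standard_normal((1000, n_in))
T = np.eye(n_out)[(X[:, 0] > 0).astype(int)]        # one-hot targets

for epoch in range(30):
    for x, t in zip(X, T):
        h = relu(W1 @ x)                # first (clean) forward pass
        y = W2 @ h
        e = y - t                       # output error
        x_err = x + F @ e               # error-modulated input
        h_err = relu(W1 @ x_err)        # second forward pass
        W2 -= lr * np.outer(e, h_err)               # update from error and modulated activity
        W1 -= lr * np.outer(h - h_err, x_err)       # update from the difference of the two passes

pred = (relu(X @ W1.T) @ W2.T).argmax(axis=1)
print(f"training accuracy: {(pred == T.argmax(axis=1)).mean():.2f}")
```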

}, url = {https://proceedings.mlr.press/v162/dellaferrera22a.html}, author = {Giorgia Dellaferrera and Gabriel Kreiman} } @article {5086, title = {Eszopiclone and Zolpidem Produce Opposite Effects on Hippocampal Ripple Density}, journal = {Frontiers in Pharmacology}, volume = {12}, year = {2022}, month = {01/2022}, abstract = {

Clinical populations have memory deficits linked to sleep oscillations that can potentially be treated with sleep medications. Eszopiclone and zolpidem (two non-benzodiazepine hypnotics) both enhance sleep spindles. Zolpidem improved sleep-dependent memory consolidation in humans, but eszopiclone did not. These divergent results may reflect that the two drugs have different effects on hippocampal ripple oscillations, which correspond to the reactivation of neuronal ensembles that represent previous waking activity and contribute to memory consolidation. We used extracellular recordings in the CA1 region of rats and systemic dosing of eszopiclone and zolpidem to test the hypothesis that these two drugs differentially affect hippocampal ripples and spike activity. We report evidence that eszopiclone makes ripples sparser, while zolpidem increases ripple density. In addition, eszopiclone led to a drastic decrease in spike firing, both in putative pyramidal cells and interneurons, while zolpidem did not substantially alter spiking. These results provide an explanation of the different effects of eszopiclone and zolpidem on memory in human studies and suggest that sleep medications can be used to regulate hippocampal ripple oscillations, which are causally linked to sleep-dependent memory consolidation.

}, doi = {10.3389/fphar.2021.792148}, url = {https://www.frontiersin.org/articles/10.3389/fphar.2021.792148/full}, author = {Becker, Logan A. and Hector Penagos and Francisco J. Flores and Manoach, Dara S. and Matthew A. Wilson and Varela, Carmen} } @article {5303, title = {The evolution of color naming reflects pressure for efficiency: Evidence from the recent past}, journal = {Journal of Language Evolution}, year = {2022}, month = {04/2022}, abstract = {

It has been proposed that semantic systems evolve under pressure for efficiency. This hypothesis has so far been supported largely indirectly, by synchronic cross-language comparison, rather than directly by diachronic data. Here, we directly test this hypothesis in the domain of color naming, by analyzing recent diachronic data from Nafaanra, a language of Ghana and C{\^o}te d{\textquoteright}Ivoire, and comparing it with quantitative predictions derived from the mathematical theory of efficient data compression. We show that color naming in Nafaanra has changed over the past four decades while remaining near-optimally efficient, and that this outcome would be unlikely under a random drift process that maintains structured color categories without pressure for efficiency. To our knowledge, this finding provides the first direct evidence that color naming evolves under pressure for efficiency, supporting the hypothesis that efficiency shapes the evolution of the lexicon.

}, issn = {2058-4571}, doi = {10.1093/jole/lzac001}, url = {https://academic.oup.com/jole/advance-article/doi/10.1093/jole/lzac001/6566271}, author = {Zaslavsky, Noga and Garvin, Karee and Kemp, Charles and Tishby, Naftali and Regier, Terry} } @article {5096, title = {Face neurons encode nonsemantic features}, journal = {Proceedings of the National Academy of Sciences}, volume = {119}, year = {2022}, month = {02/2022}, abstract = {

The primate inferior temporal cortex contains neurons that respond more strongly to faces than to other objects. Termed {\textquotedblleft}face neurons,{\textquotedblright} these neurons are thought to be selective for faces as a semantic category. However, face neurons also partly respond to clocks, fruits, and single eyes, raising the question of whether face neurons are better described as selective for visual features related to faces but dissociable from them. We used a recently described algorithm, XDream, to evolve stimuli that strongly activated face neurons. XDream leverages a generative neural network that is not limited to realistic objects. Human participants assessed images evolved for face neurons and for nonface neurons and natural images depicting faces, cars, fruits, etc. Evolved images were consistently judged to be distinct from real faces. Images evolved for face neurons were rated as slightly more similar to faces than images evolved for nonface neurons. There was a correlation among natural images between face neuron activity and subjective {\textquotedblleft}faceness{\textquotedblright} ratings, but this relationship did not hold for face neuron{\textendash}evolved images, which triggered high activity but were rated low in faceness. Our results suggest that so-called face neurons are better described as tuned to visual features rather than semantic categories.

}, issn = {0027-8424}, doi = {10.1073/pnas.2118705119}, url = {https://pnas.org/doi/full/10.1073/pnas.2118705119}, author = {Bardon, Alexandra and Will Xiao and Carlos R Ponce and Margaret S Livingstone and Gabriel Kreiman} } @proceedings {5105, title = {Finding Biological Plausibility for Adversarially Robust Features via Metameric Tasks}, year = {2022}, keywords = {Adversarial Robustness, Metamerism, Perceptual Invariance, Peripheral Computation, Psychophysics, Texture}, url = {https://openreview.net/forum?id=yeP_zx9vqNm}, author = {Anne Harrington and Arturo Deza} } @article {5288, title = {Genome-wide mapping of somatic mutation rates uncovers drivers of cancer}, journal = {Nature Biotechnology}, volume = {40}, year = {2022}, month = {06/2022}, pages = {1634 - 1643}, abstract = {

Identification of cancer driver mutations that confer a proliferative advantage is central to understanding cancer; however, searches have often been limited to protein-coding sequences and specific non-coding elements (for example, promoters) because of the challenge of modeling the highly variable somatic mutation rates observed across tumor genomes. Here we present Dig, a method to search for driver elements and mutations anywhere in the genome. We use deep neural networks to map cancer-specific mutation rates genome-wide at kilobase-scale resolution. These estimates are then refined to search for evidence of driver mutations under positive selection throughout the genome by comparing observed to expected mutation counts. We mapped mutation rates for 37 cancer types and applied these maps to identify putative drivers within intronic cryptic splice regions, 5' untranslated regions and infrequently mutated genes. Our high-resolution mutation rate maps, available for web-based exploration, are a resource to enable driver discovery genome-wide.

}, issn = {1087-0156}, doi = {10.1038/s41587-022-01353-8}, url = {https://www.nature.com/articles/s41587-022-01353-8}, author = {Sherman, Maxwell A. and Yaari, Adam U. and Priebe, Oliver and Dietlein, Felix and Loh, Po-Ru and Berger, Bonnie} } @article {5087, title = {Harmonicity aids hearing in noise}, journal = {Attention, Perception, \& Psychophysics}, year = {2022}, month = {01/2022}, abstract = {

Hearing in noise is a core problem in audition, and a challenge for hearing-impaired listeners, yet the underlying mechanisms are poorly understood. We explored whether harmonic frequency relations, a signature property of many communication sounds, aid hearing in noise for normal hearing listeners. We measured detection thresholds in noise for tones and speech synthesized to have harmonic or inharmonic spectra. Harmonic signals were consistently easier to detect than otherwise identical inharmonic signals. Harmonicity also improved discrimination of sounds in noise. The largest benefits were observed for two-note up-down {\textquotedblleft}pitch{\textquotedblright} discrimination and melodic contour discrimination, both of which could be performed equally well with harmonic and inharmonic tones in quiet, but which showed large harmonic advantages in noise. The results show that harmonicity facilitates hearing in noise, plausibly by providing a noise-robust pitch cue that aids detection and discrimination.

}, issn = {1943-3921}, doi = {10.3758/s13414-021-02376-0}, url = {https://link.springer.com/10.3758/s13414-021-02376-0}, author = {McPherson, Malinda J. and Grace, River C. and Josh H. McDermott} } @article {5281, title = {A highly selective response to food in human visual cortex revealed by hypothesis-free voxel decomposition}, journal = {Current Biology}, volume = {32}, year = {2022}, month = {10/2022}, pages = {4159 - 4171.e9}, abstract = {

Prior work has identified cortical regions selectively responsive to specific categories of visual stimuli. However, this hypothesis-driven work cannot reveal how prominent these category selectivities are in the overall functional organization of the visual cortex, or what others might exist that scientists have not thought to look for. Furthermore, standard voxel-wise tests cannot detect distinct neural selectivities that coexist within voxels. To overcome these limitations, we used data-driven voxel decomposition methods to identify the main components underlying fMRI responses to thousands of complex photographic images. Our hypothesis-neutral analysis rediscovered components selective for faces, places, bodies, and words, validating our method and showing that these selectivities are dominant features of the ventral visual pathway. The analysis also revealed an unexpected component with a distinct anatomical distribution that responded highly selectively to images of food. Alternative accounts based on low- to mid-level visual features, such as color, shape, or texture, failed to account for the food selectivity of this component. High-throughput testing and control experiments with matched stimuli on a highly accurate computational model of this component confirm its selectivity for food. We registered our methods and hypotheses before replicating them on held-out participants and in a novel dataset. These findings demonstrate the power of data-driven methods and show that the dominant neural responses of the ventral visual pathway include not only selectivities for faces, scenes, bodies, and words but also the visually heterogeneous category of food, thus constraining accounts of when and why functional specialization arises in the cortex.
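As a generic illustration of the voxel-decomposition idea (the paper's own decomposition method differs in detail), the sketch below factors a synthetic voxel-by-image response matrix into a small number of response components and per-voxel weights using non-negative matrix factorization; all data and dimensions are made up for illustration.

```python
# Generic illustration of data-driven voxel decomposition (not the paper's exact method):
# factor a voxel-by-image response matrix into response components and per-voxel weights.
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
n_voxels, n_images, n_components = 500, 300, 4

# Synthetic stand-in data: responses generated from 4 latent components plus noise.
true_profiles = rng.random((n_components, n_images))
true_weights = rng.random((n_voxels, n_components))
responses = true_weights @ true_profiles + 0.05 * rng.random((n_voxels, n_images))

model = NMF(n_components=n_components, init='nndsvda', max_iter=500, random_state=0)
voxel_weights = model.fit_transform(responses)      # (n_voxels, n_components)
component_profiles = model.components_              # (n_components, n_images)

# A category-selective component would appear as a profile with high values only for
# images of that category; here we just report reconstruction quality on synthetic data.
recon = voxel_weights @ component_profiles
print(f"reconstruction R^2: {1 - np.var(responses - recon) / np.var(responses):.3f}")
```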

}, issn = {09609822}, doi = {10.1016/j.cub.2022.08.009}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0960982222012866}, author = {Khosla, Meenakshi and Ratan Murty, N. Apurva and Kanwisher, Nancy} } @article {5177, title = {How Deep Sparse Networks Avoid the Curse of Dimensionality: Efficiently Computable Functions are Compositionally Sparse}, year = {2022}, month = {10/2022}, abstract = {

The main claim of this perspective is that compositional sparsity of the target function, which corresponds to the task to be learned, is the key principle underlying machine learning. Mhaskar and Poggio (2016) proved that sparsity of the compositional target functions (when the functions are on the reals, the constituent functions are also required to be smooth) naturally leads to sparse deep networks for approximation and thus for optimization. This is the case for most CNNs in current use, in which the known sparse graph of the target function is reflected in the sparse connectivity of the network. When the graph of the target function is unknown, I conjecture that transformers are able to implement a flexible version of sparsity (selecting which input tokens interact in the MLP layer) through the self-attention layers. Surprisingly, the assumption of compositional sparsity of the target function is not restrictive in practice, since I prove here that for computable functions (if on the reals, with Lipschitz continuous derivatives) compositional sparsity is equivalent to efficient computability, that is, Turing computability in polynomial time.
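A toy example of a compositionally sparse function: a function of eight inputs built from constituent functions that each depend on only two arguments, arranged as a binary tree. The particular constituents below are arbitrary and chosen only for illustration.

```python
# Toy compositionally sparse function: 8 inputs, but every constituent function
# depends on just 2 arguments, composed as a binary tree.
import numpy as np

def g(a, b):            # each constituent depends on only two inputs
    return np.tanh(a + 2.0 * b)

def f(x):
    """f(x1,...,x8) = g(g(g(x1,x2), g(x3,x4)), g(g(x5,x6), g(x7,x8)))"""
    x1, x2, x3, x4, x5, x6, x7, x8 = x
    return g(g(g(x1, x2), g(x3, x4)),
             g(g(x5, x6), g(x7, x8)))

print(f(np.arange(8) / 8.0))
```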

}, author = {Tomaso A. Poggio} } @article {5089, title = {On the Implicit Bias Towards Minimal Depth of Deep Neural Networks}, journal = {arXiv}, year = {2022}, month = {03/2022}, abstract = {

We study the implicit bias of gradient-based training methods to favor low-depth solutions when training deep neural networks. Recent results in the literature suggest that penultimate layer representations learned by a classifier over multiple classes exhibit a clustering property, called neural collapse. We demonstrate empirically that neural collapse extends beyond the penultimate layer and emerges in intermediate layers as well. In this regard, we hypothesize and empirically show that gradient-based methods are implicitly biased towards selecting neural networks of minimal depth for achieving this clustering property.
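As an illustration of the clustering property referenced above, the sketch below computes a standard within-class variability collapse measure (often written Tr(Sigma_W Sigma_B^+)/C) on synthetic features; the features and labels are assumptions, and this is not the paper's full analysis.

```python
# Illustrative within-class variability collapse measure Tr(Sigma_W Sigma_B^+)/C:
# small values mean features have collapsed onto their class means. It can be
# applied to features extracted from any intermediate layer.
import numpy as np

def nc1(features, labels):
    """features: (n_samples, d); labels: (n_samples,) integer class labels."""
    classes = np.unique(labels)
    global_mean = features.mean(axis=0)
    sigma_w = np.zeros((features.shape[1], features.shape[1]))
    sigma_b = np.zeros_like(sigma_w)
    for c in classes:
        fc = features[labels == c]
        mu_c = fc.mean(axis=0)
        diff_w = fc - mu_c
        sigma_w += diff_w.T @ diff_w / len(features)          # within-class covariance
        diff_b = (mu_c - global_mean)[:, None]
        sigma_b += (diff_b @ diff_b.T) * len(fc) / len(features)   # between-class covariance
    return np.trace(sigma_w @ np.linalg.pinv(sigma_b)) / len(classes)

rng = np.random.default_rng(0)
labels = rng.integers(0, 5, size=1000)
collapsed = np.eye(5)[labels] * 10 + 0.01 * rng.standard_normal((1000, 5))
diffuse = np.eye(5)[labels] * 10 + 2.0 * rng.standard_normal((1000, 5))
print(f"NC1 (collapsed features): {nc1(collapsed, labels):.4f}")
print(f"NC1 (diffuse features):   {nc1(diffuse, labels):.4f}")
```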

}, url = {https://arxiv.org/abs/2202.09028}, author = {Tomer Galanti and Liane Galanti} } @article {5050, title = {Incorporating Rich Social Interactions Into MDPs}, year = {2022}, abstract = {

Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology and economics can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s hidden rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions zero-shot in new environments; like humans it can engage socially in novel ways even without a single example of that social interaction. Moreover, the judgments of these Social MDPs align closely with those of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics, MDPs.

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Bennett Stankovits and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz and Andrei Barbu} } @conference {5291, title = {Incorporating Rich Social Interactions Into MDPs}, booktitle = {2022 IEEE International Conference on Robotics and Automation (ICRA)}, year = {2022}, month = {05/2022}, address = {Philadelphia, PA, USA}, abstract = {

Much of what we do as humans is engage socially with other agents, a skill that robots must also eventually possess. We demonstrate that a rich theory of social interactions originating from microsociology can be formalized by extending a nested MDP where agents reason about arbitrary functions of each other{\textquoteright}s rewards. This extended Social MDP allows us to encode the five basic interactions that underlie microsociology: cooperation, conflict, coercion, competition, and exchange. The result is a robotic agent capable of executing social interactions in new environments with no interaction-specific training; like humans it can engage socially in novel ways even without a single example of that social interaction. Moreover, the estimations of these Social MDPs align closely with the judgments of humans when considering which social interaction is taking place in an environment. This method both sheds light on the nature of social interactions, by providing concrete mathematical definitions, and brings rich social interactions into a mathematical framework that has proven to be natural for robotics.

}, doi = {10.1109/ICRA46639.2022.9811991}, url = {https://ieeexplore.ieee.org/document/9811991/}, author = {Tejwani, Ravi and Kuo, Yen-Ling and Shu, Tianmin and Stankovits, Bennett and Gutfreund, Dan and Tenenbaum, Joshua B. and Katz, Boris and Barbu, Andrei} } @article {5292, title = {Inductive biases in theory-based reinforcement learning}, journal = {Cognitive Psychology}, volume = {138}, year = {2022}, month = {11/2022}, pages = {101509}, abstract = {

Understanding the inductive biases that allow humans to learn in complex environments has been an important goal of cognitive science. Yet, while we have discovered much about human biases in specific learning domains, much of this research has focused on simple tasks that lack the complexity of the real world. In contrast, video games involving agents and objects embedded in richly structured systems provide an experimentally tractable proxy for real-world complexity. Recent work has suggested that key aspects of human learning in domains like video games can be captured by model-based reinforcement learning (RL) with object-oriented relational models{\textemdash}what we term theory-based RL. Restricting the model class in this way provides an inductive bias that dramatically increases learning efficiency, but in this paper we show that humans employ a stronger set of biases in addition to syntactic constraints on the structure of theories. In particular, we catalog a set of semantic biases that constrain the content of theories. Building these semantic biases into a theory-based RL system produces more human-like learning in video game environments.

}, issn = {00100285}, doi = {10.1016/j.cogpsych.2022.101509}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010028522000457}, author = {Pouncy, Thomas and Gershman, Samuel J.} } @article {5225, title = {Infants infer potential social partners by observing the interactions of their parent with unknown others}, journal = {Proceedings of the National Academy of Sciences}, volume = {119}, year = {2022}, month = {05/2022}, abstract = {

Significance

Despite decades of research on the development of social knowledge, few experiments have tested how infants learn about new individuals from the behavior of their caregivers. Here, we show that infants learn who is a potential social partner by observing their parents{\textquoteright} interactions with previously unknown individuals.

Abstract

Infants are born into networks of individuals who are socially connected. How do infants begin learning which individuals are their own potential social partners? Using digitally edited videos, we showed 12-mo-old infants social interactions between unknown individuals and their own parents. In studies 1 to 4, after their parent showed affiliation toward one puppet, infants expected that puppet to engage with them. In study 5, infants made the reverse inference; after a puppet engaged with them, the infants expected that puppet to respond to their parent. In each study, infants{\textquoteright} inferences were specific to social interactions that involved their own parent as opposed to another infant{\textquoteright}s parent. Thus, infants combine observation of social interactions with knowledge of their preexisting relationship with their parent to discover which newly encountered individuals are potential social partners for themselves and their families.

}, issn = {0027-8424}, doi = {10.1073/pnas.2121390119}, url = {https://pnas.org/doi/full/10.1073/pnas.2121390119}, author = {Thomas, Ashley J. and Saxe, Rebecca and Spelke, Elizabeth S.} } @article {5282, title = {Invariant representation of physical stability in the human brain}, journal = {eLife}, volume = {11}, year = {2022}, month = {05/2022}, abstract = {

Successful engagement with the world requires the ability to predict what will happen next. Here, we investigate how the brain makes a fundamental prediction about the physical world: whether the situation in front of us is stable, and hence likely to stay the same, or unstable, and hence likely to change in the immediate future. Specifically, we ask if judgments of stability can be supported by the kinds of representations that have proven to be highly effective at visual object recognition in both machines and brains, or instead if the ability to determine the physical stability of natural scenes may require generative algorithms that simulate the physics of the world. To find out, we measured responses in both convolutional neural networks (CNNs) and the brain (using fMRI) to natural images of physically stable versus unstable scenarios. We find no evidence for generalizable representations of physical stability in either standard CNNs trained on visual object and scene classification (ImageNet), or in the human ventral visual pathway, which has long been implicated in the same process. However, in frontoparietal regions previously implicated in intuitive physical reasoning we find both scenario-invariant representations of physical stability, and higher univariate responses to unstable than stable scenes. These results demonstrate abstract representations of physical stability in the dorsal but not ventral pathway, consistent with the hypothesis that the computations underlying stability entail not just pattern classification but forward physical simulation.

}, doi = {10.7554/eLife.71736}, url = {https://elifesciences.org/articles/71736}, author = {Pramod, RT and Cohen, Michael A and Tenenbaum, Joshua B and Kanwisher, Nancy} } @conference {5106, title = {Joint rotational invariance and adversarial training of a dual-stream Transformer yields state of the art Brain-Score for Area V4}, booktitle = {BrainScore Workshop at COSYNE}, year = {2022}, keywords = {adversarial training, Brain-Score competition, rotation invariance, Vision Transformer}, url = {https://openreview.net/pdf?id=SOulrWP-Xb5}, author = {William Berrios and Arturo Deza} } @article {5295, title = {Learning new physics efficiently with nonparametric methods}, journal = {The European Physical Journal C}, volume = {82}, year = {2022}, month = {10/2022}, abstract = {

We present a machine learning approach for model-independent new physics searches. The corresponding algorithm is powered by recent large-scale implementations of kernel methods, nonparametric learning algorithms that can approximate any continuous function given enough data. Based on the original proposal by D{\textquoteright}Agnolo and Wulzer (Phys Rev D 99(1):015014, 2019, arXiv:1806.02350 [hep-ph]), the model evaluates the compatibility between experimental data and a reference model, by implementing a hypothesis testing procedure based on the likelihood ratio. Model-independence is enforced by avoiding any prior assumption about the presence or shape of new physics components in the measurements. We show that our approach has dramatic advantages compared to neural network implementations in terms of training times and computational resources, while maintaining comparable performances. In particular, we conduct our tests on higher dimensional datasets, a step forward with respect to previous studies.

}, doi = {10.1140/epjc/s10052-022-10830-y}, url = {https://link.springer.com/10.1140/epjc/s10052-022-10830-y}, author = {Letizia, Marco and Losapio, Gianvito and Rando, Marco and Grosso, Gaia and Wulzer, Andrea and Pierini, Maurizio and Zanetti, Marco and Rosasco, Lorenzo} } @book {5289, title = {Image2Point: 3D Point-Cloud Understanding with 2D Image Pretrained Models}, booktitle = {Computer Vision {\textendash} ECCV 2022}, series = {Lecture Notes in Computer Science}, volume = {13697}, year = {2022}, month = {10/2022}, pages = {638 - 656}, publisher = {Springer Nature Switzerland}, organization = {Springer Nature Switzerland}, address = {Cham}, abstract = {

3D point-clouds and 2D images are different visual representations of the physical world. While human vision can understand both representations, computer vision models designed for 2D image and 3D point-cloud understanding are quite different. Our paper explores the potential of transferring 2D model architectures and weights to understand 3D point-clouds, by empirically investigating the feasibility of the transfer, the benefits of the transfer, and shedding light on why the transfer works. We discover that we can indeed use the same architecture and pretrained weights of a neural net model to understand both images and point-clouds. Specifically, we transfer the image-pretrained model to a point-cloud model by copying or inflating the weights. We find that finetuning the transformed image-pretrained models (FIP) with minimal efforts{\textemdash}only on input, output, and normalization layers{\textemdash}can achieve competitive performance on 3D point-cloud classification, beating a wide range of point-cloud models that adopt task-specific architectures and use a variety of tricks. When finetuning the whole model, the performance gets further improved. Meanwhile, FIP improves data efficiency, reaching up to 10.0 top-1 accuracy percent on few-shot classification. It also speeds up the training of point-cloud models by up to 11.1x for a target accuracy (e.g., 90\% accuracy). Lastly, we provide an explanation of the image to point-cloud transfer from the aspect of neural collapse. The code is available at: https://github.com/chenfengxu714/image2point.

}, isbn = {978-3-031-19835-9}, issn = {0302-9743}, doi = {10.1007/978-3-031-19836-6_36}, url = {https://link.springer.com/10.1007/978-3-031-19836-6}, author = {Xu, Chenfeng and Yang, Shijia and Galanti, Tomer and Wu, Bichen and Yue, Xiangyu and Zhai, Bohan and Zhan, Wei and Vajda, Peter and Keutzer, Kurt and Tomizuka, Masayoshi}, editor = {Avidan, Shai and Brostow, Gabriel and Ciss{\'e}, Moustapha and Farinella, Giovanni Maria and Hassner, Tal} }

Primates constantly explore their surroundings via saccadic eye movements that bring different parts of an image into high resolution. In addition to exploring new regions in the visual field, primates also make frequent return fixations, revisiting previously foveated locations. We systematically studied a total of 44,328 return fixations out of 217,440 fixations. Return fixations were ubiquitous across different behavioral tasks, in monkeys and humans, both when subjects viewed static images and when subjects performed natural behaviors. Return fixation locations were consistent across subjects, tended to occur within short temporal offsets, and typically followed a 180-degree turn in saccadic direction. To understand the origin of return fixations, we propose a proof-of-principle, biologically-inspired and image-computable neural network model. The model combines five key modules: an image feature extractor, bottom-up saliency cues, task-relevant visual features, finite inhibition-of-return, and saccade size constraints. Even though there are no free parameters that are fine-tuned for each specific task, species, or condition, the model produces fixation sequences resembling the universal properties of return fixations. These results provide initial steps towards a mechanistic understanding of the trade-off between rapid foveal recognition and the need to scrutinize previous fixation locations.

}, doi = {10.1371/journal.pcbi.1010654}, url = {https://dx.plos.org/10.1371/journal.pcbi.1010654}, author = {Zhang, Mengmi and Armendariz, Marcelo and Xiao, Will and Rose, Olivia and Bendtz, Katarina and Livingstone, Margaret and Ponce, Carlos and Kreiman, Gabriel}, editor = {Faisal, Aldo A.} } @article {5296, title = {Mental Jenga: A counterfactual simulation model of causal judgments about physical support}, journal = {PsyArXiv}, year = {2022}, month = {02/2022}, abstract = {

From building towers to picking an orange from a stack of fruit, assessing support is critical for successfully interacting with the physical world. But how do people determine whether one object supports another? In this paper, we develop the Counterfactual Simulation Model (CSM) of causal judgments about physical support. The CSM predicts that people judge physical support by mentally simulating what would happen to a scene if the object of interest were removed. Three experiments test the model by asking one group of participants to judge what would happen to a tower if one of the blocks were removed, and asking another group how responsible that block was for the tower{\textquoteright}s stability. The CSM accurately captures participants{\textquoteright} predictions by running noisy simulations that incorporate different sources of uncertainty. Participants{\textquoteright} responsibility judgments are closely related to counterfactual predictions: a block is more responsible when many other blocks would fall if it were removed. By construing physical support as preventing objects from falling, the CSM provides a unified account of how causal judgments in dynamic and static physical scenes arise from the process of counterfactual simulation.
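The core computation can be sketched in a few lines of Python: run many noisy simulations of the counterfactual scene with the target block removed, and score the block's responsibility by how often other blocks then fall. The physics here is a deliberately toy stand-in (a single overhang-vs-support criterion), not the simulation engine used in the paper.

import random

def simulate_tower(blocks, noise_sd=0.05):
    """Hypothetical stand-in for a physics engine: return the ids of blocks that fall,
    with Gaussian noise injected into each block's geometry."""
    fallen = set()
    for b in blocks:
        jitter = random.gauss(0.0, noise_sd)
        if b["overhang"] + jitter > b["support"]:   # toy stability criterion
            fallen.add(b["id"])
    return fallen

def responsibility(blocks, target_id, n_samples=1000):
    """Counterfactual responsibility of `target_id`: fraction of noisy simulations in which
    removing it causes at least one other block to fall."""
    counterfactual = [b for b in blocks if b["id"] != target_id]
    falls = sum(1 for _ in range(n_samples) if simulate_tower(counterfactual))
    return falls / n_samples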

}, url = {https://psyarxiv.com/4a5uh}, author = {Liang Zhou and Kevin Smith and Joshua B. Tenenbaum and Tobias Gerstenberg} } @conference {5088, title = {Neural Collapse in Deep Homogeneous Classifiers and the role of Weight Decay}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2022}, month = {05/2022}, address = {Singapore}, author = {Andrzej Banburski and Akshay Rangamani} } @article {5027, title = {A neural population selective for song in human auditory cortex}, journal = {Current Biology}, year = {2022}, month = {02/2022}, abstract = {

How is music represented in the brain? While neuroimaging has revealed some spatial segregation between responses to music versus other sounds, little is known about the neural code for music itself. To address this question, we developed a method to infer canonical response components of human auditory cortex using intracranial responses to natural sounds, and further used the superior coverage of fMRI to map their spatial distribution. The inferred components replicated many prior findings, including distinct neural selectivity for speech and music, but also revealed a novel component that responded nearly exclusively to music with singing. Song selectivity was not explainable by standard acoustic features, was located near speech and music-selective responses, and was also evident in individual electrodes. These results suggest that representations of music are fractionated into subpopulations selective for different types of music, one of which is specialized for the analysis of song.

}, issn = {09609822}, doi = {10.1016/j.cub.2022.01.069}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0960982222001312}, author = {Norman-Haignere, Sam V. and Jenelle Feather and Boebinger, Dana and Brunner, Peter and Ritaccio, Anthony and Josh H. McDermott and Schalk, Gerwin and Nancy Kanwisher} } @article {5297, title = {NeuroDecodeR: A package for neural decoding analyses in R}, journal = {bioRxiv}, year = {2022}, month = {12/2022}, abstract = {

Neural decoding is a powerful method to analyze neural activity. However, the code needed to run a decoding analysis can be complex, which can present a barrier to using the method. In this paper we introduce a package that makes it easy to perform decoding analyses in the R programming language. We describe how the package is designed in a modular fashion, which allows researchers to easily implement a range of different analyses. We also discuss how to format data to be able to use the package, and we give two examples of how to use the package to analyze real data. We believe that this package, combined with the rich data analysis ecosystem in R, will make it significantly easier for researchers to create reproducible decoding analyses, which should help increase the pace of neuroscience discoveries.

}, url = {https://www.biorxiv.org/content/10.1101/2022.12.17.520811v1}, author = {Meyers, Ethan M.} } @article {5031, title = {Neurons detect cognitive boundaries to structure episodic memories in humans}, journal = {Nature Neuroscience}, volume = {25}, year = {2022}, month = {03/2022}, pages = {358 - 368}, abstract = {

While experience is continuous, memories are organized as discrete events. Cognitive boundaries are thought to segment experience and structure memory, but how this process is implemented remains unclear. We recorded the activity of single neurons in the human medial temporal lobe (MTL) during the formation and retrieval of memories with complex narratives. Here, we show that neurons responded to abstract cognitive boundaries between different episodes. Boundary-induced neural state changes during encoding predicted subsequent recognition accuracy but impaired event order memory, mirroring a fundamental behavioral tradeoff between content and time memory. Furthermore, the neural state following boundaries was reinstated during both successful retrieval and false memories. These findings reveal a neuronal substrate for detecting cognitive boundaries that transform experience into mnemonic episodes and structure mental time travel during retrieval.

}, issn = {1097-6256}, doi = {10.1038/s41593-022-01020-w}, url = {https://www.nature.com/articles/s41593-022-01020-w}, author = {Zheng, Jie and Schjetnan, Andrea G. P. and Yebra, Mar and Gomes, Bernard A. and Mosher, Clayton P. and Kalia, Suneil K. and Valiante, Taufik A. and Mamelak, Adam N. and Gabriel Kreiman and Rutishauser, Ueli} } @article {5039, title = {One thing to fool them all: generating interpretable, universal, and physically-realizable adversarial features}, journal = {arXiv}, year = {2022}, month = {01/2022}, abstract = {

It is well understood that modern deep networks are vulnerable to adversarial attacks. However, conventional attack methods fail to produce adversarial perturbations that are intelligible to humans, and they pose limited threats in the physical world. To study feature-class associations in networks and better understand their vulnerability to attacks in the real world, we develop feature-level adversarial perturbations using deep image generators and a novel optimization objective. We term these feature-fool attacks. We show that they are versatile and use them to generate targeted feature-level attacks at the ImageNet scale that are simultaneously interpretable, universal to any source image, and physically-realizable. These attacks reveal spurious, semantically-describable feature/class associations that can be exploited by novel combinations of objects. We use them to guide the design of {\textquotedblleft}copy/paste{\textquotedblright} adversaries in which one natural image is pasted into another to cause a targeted misclassification.
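A minimal sketch of the general strategy described above, in PyTorch: rather than perturbing pixels, optimize a perturbation to a generator's latent code so that the generated image is classified as a chosen target class. `generator` and `classifier` are assumed pretrained modules and the loss is plain cross-entropy; the paper's specific models and objective are not reproduced here.

import torch

def feature_level_attack(generator, classifier, z, target_class, steps=200, lr=0.05):
    """Optimize a latent-space perturbation `delta` so generator(z + delta) is classified
    as `target_class`; z is a (1, latent_dim) latent code."""
    delta = torch.zeros_like(z, requires_grad=True)
    opt = torch.optim.Adam([delta], lr=lr)
    target = torch.tensor([target_class])
    for _ in range(steps):
        image = generator(z + delta)                  # perturb features, not pixels
        loss = torch.nn.functional.cross_entropy(classifier(image), target)
        opt.zero_grad()
        loss.backward()
        opt.step()
    return generator(z + delta).detach()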

}, doi = {10.48550/arXiv.2110.03605}, url = {https://arxiv.org/abs/2110.03605}, author = {Stephen Casper and Max Nadeau and Gabriel Kreiman} } @article {5092, title = {PCA as a defense against some adversaries}, year = {2022}, abstract = {

Neural network classifiers are known to be highly vulnerable to adversarial perturbations in their inputs. Under the hypothesis that adversarial examples lie outside of the sub-manifold of natural images, previous work has investigated the impact of principal components in data on adversarial robustness. In this paper we show that there exists a very simple defense mechanism in the case where adversarial images are separable in a previously defined $(k,p)$ metric. This defense is very successful against the popular Carlini-Wagner attack, but less so against some other common attacks like FGSM. It is interesting to note that the defense is still successful for relatively large perturbations.
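For intuition, here is a minimal sketch of a PCA-projection defense of this general flavor: fit PCA on natural training images, then project every (possibly adversarial) input onto the top-k principal subspace and reconstruct it before classification. The value of k and the projection-reconstruction scheme are illustrative assumptions, not the paper's $(k,p)$-metric construction.

import numpy as np
from sklearn.decomposition import PCA

def fit_pca_defense(X_train, k=50):
    """Fit PCA on flattened natural training images, X_train: (n, ...)."""
    pca = PCA(n_components=k)
    pca.fit(X_train.reshape(len(X_train), -1))
    return pca

def pca_purify(pca, X):
    """Project inputs onto the top-k principal subspace and reconstruct them,
    discarding directions off the natural-image subspace before classification."""
    flat = X.reshape(len(X), -1)
    return pca.inverse_transform(pca.transform(flat)).reshape(X.shape)

# y_pred = classifier.predict(pca_purify(pca, X_test))   # `classifier` is any fitted model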

}, author = {Aparna Gupte and Andrzej Banburski and Tomaso Poggio} } @conference {5298, title = {Primate Inferotemporal Cortex Neurons Generalize Better to Novel Image Distributions Than Analogous Deep Neural Networks Units}, booktitle = {NeurIPS}, year = {2022}, month = {10/2022}, abstract = {

Humans are successfully able to recognize objects in a variety of image distributions. Today{\textquoteright}s artificial neural networks (ANNs), on the other hand, struggle to recognize objects in many image domains, especially those different from the training distribution. It is currently unclear which parts of the ANNs could be improved in order to close this generalization gap. In this work, we used recordings from primate high-level visual cortex (IT) to isolate whether ANNs lag behind primate generalization capabilities because of their encoder (transformations up to the penultimate layer), or their decoder (linear transformation into class labels). Specifically, we fit a linear decoder on images from one domain and evaluate transfer performance on twelve held-out domains, comparing fitting on primate IT representations vs. representations in ANN penultimate layers. To fairly compare, we scale the number of each ANN{\textquoteright}s units so that its in-domain performance matches that of the sampled IT population (i.e. 71 IT neural sites, 73\% binary-choice accuracy). We find that the sampled primate population achieves, on average, 68\% performance on the held-out-domains. Comparably sampled populations from ANN model units generalize less well, maintaining on average 60\%. This is independent of the number of sampled units: models{\textquoteright} out-of-domain accuracies consistently lag behind primate IT. These results suggest that making ANN model units more like primate IT will improve the generalization performance of ANNs.
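The evaluation protocol described above reduces to a short cross-domain decoding loop; the sketch below (illustrative, not the authors' exact pipeline) fits a linear readout on one domain's representations and scores it on held-out domains, where the representations can be either IT site responses or ANN penultimate-layer activations.

import numpy as np
from sklearn.linear_model import LogisticRegression

def out_of_domain_accuracy(features, labels, train_domain, test_domains):
    """features[d]: (n_images, n_units) responses for domain d; labels[d]: class labels."""
    clf = LogisticRegression(max_iter=5000)
    clf.fit(features[train_domain], labels[train_domain])
    return {d: clf.score(features[d], labels[d]) for d in test_domains}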

}, url = {https://openreview.net/forum?id=iPF7mhoWkOl}, author = {Ayu Marliawaty I Gusti Bagus and Tiago Marques and Sachi Sanghavi and James J. DiCarlo and Martin Schrimpf} } @book {5294, title = {Quality Early Learning: Nurturing Children{\textquoteright}s Potential}, year = {2022}, publisher = {The World Bank}, organization = {The World Bank}, isbn = {978-1-4648-1795-3}, doi = {10.1596/978-1-4648-1795-3}, url = {http://elibrary.worldbank.org/doi/book/10.1596/978-1-4648-1795-3}, editor = {Bendini, Magdalena and Devercelli, Amanda} } @article {5299, title = {Quantifying the Emergence of Symbolic Communication}, journal = {CogSci}, year = {2022}, abstract = {

We quantitatively study the emergence of symbolic communication in humans with a communication game that attempts to recapitulate an essential step in the development of human language: the emergence of shared signs. In our experiment, a teacher must communicate a first order logic formula to a student through a narrow channel deprived of common shared signs: subjects cannot communicate with each other with the sole exception of car motions in a computer game. Subjects spontaneously develop a shared vocabulary of car motions including indices, icons, and symbols, spanning both task-specific and task-agnostic concepts such as {\textquotedblleft}square{\textquotedblright} and {\textquotedblleft}understand{\textquotedblright}. We characterize the conditions under which indices, icons, and symbols arise, finding that symbols are harder to establish than icons and indices. We observe that the dominant sign category being developed transitions from indices to icons to symbols, and identify communicating in ambiguous game environments as a pressure for icon and symbol development.

}, url = {https://escholarship.org/uc/item/08n3293v}, author = {Emily Cheng and Yen-Ling Kuo and Josefina Correa and Boris Katz and Ignacio Cases and Andrei Barbu} } @proceedings {5155, title = {Reasoning about the antecedents of emotions: Bayesian causal inference over an intuitive theory of mind}, volume = {44}, year = {2022}, month = {07/2022}, pages = {854-861}, address = {Toronto, CA}, abstract = {It is commonly believed that expressions visually signal rich diagnostic information to human observers. We studied how observers interpret the dynamic expressions that people spontaneously produced during a real-life high-stakes televised game. We find that human observers are remarkably poor at recovering what events elicited others{\textquoteright} facial and bodily expressions. Beyond simple inaccuracy, people{\textquoteright}s causal reasoning exhibits systematic model-based patterns of errors. We show that latent emotion representations can explain people{\textquoteright}s reasoning about the unseen causes of expressions. A hierarchical Bayesian model simulates which events people infer to be the cause of others{\textquoteright} expressions by comparing the emotions inferred from the expressions against the emotions people were predicted to experience in various situations. This causal model provides a close, parameter-free fit to human causal judgments, suggesting that humans interpret expressions in the context of emotion predictions generated by a causally-structured mental model of other minds.}, keywords = {Affective Cognition, Bayesian Theory of Mind, Causal Reasoning, Emotion Recognition, Emotion Understanding, intuitive theory}, url = {https://escholarship.org/uc/item/7sn3w3n2}, author = {Sean Dae Houlihan and Desmond Ong and Maddie Cusimano and Rebecca Saxe} } @article {5173, title = {Representation Learning in Sensory Cortex: a theory}, journal = {IEEE Access}, year = {2022}, month = {09/2022}, pages = {1 - 1}, abstract = {

We review and apply a computational theory based on the hypothesis that the main function of the feedforward path of the ventral stream in visual cortex is the encoding of invariant representations of images. A key justification of the theory is provided by a result linking invariant representations to small sample complexity for image recognition - that is, invariant representations allow learning from very few labeled examples. The theory characterizes how an algorithm that can be implemented by a set of "simple" and "complex" cells - a "Hubel Wiesel module" {\textendash} provides invariant and selective representations. The invariance can be learned in an unsupervised way from observed transformations. Our results show that an invariant representation implies several properties of the ventral stream organization, including the emergence of Gabor receptive fields and specialized areas. The theory requires two stages of processing: the first, consisting of retinotopic visual areas such as V1, V2 and V4 with generic neuronal tuning, leads to representations that are invariant to translation and scaling; the second, consisting of modules in IT (Inferior Temporal cortex), with class- and object-specific tuning, provides a representation for recognition with approximate invariance to class specific transformations, such as pose (of a body, of a face) and expression. In summary, our theory is that the ventral stream{\textquoteright}s main function is to implement the unsupervised learning of "good" representations that reduce the sample complexity of the final supervised learning stage.
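A toy Python sketch of a "Hubel Wiesel module" as characterized above: simple cells compute dot products between the input and stored transformed templates, and a complex cell pools (here, max) over them, yielding a signature invariant to those transformations. Translations of an image patch are used as the example transformation group; this is illustrative only.

import numpy as np

def hw_module_signature(image, template, shifts):
    """Max-pool simple-cell responses over translated copies of one template."""
    responses = []
    for dx, dy in shifts:
        shifted = np.roll(np.roll(template, dx, axis=0), dy, axis=1)
        responses.append(float(np.sum(image * shifted)))   # simple cell: dot product
    return max(responses)                                   # complex cell: pooling

# One signature per stored template yields a translation-invariant yet selective code:
# signature = [hw_module_signature(img, t, shifts) for t in templates]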

}, keywords = {Artificial neural networks, Hubel Wiesel model, Invariance, Sample Complexity, Simple and Complex cells, visual cortex}, doi = {10.1109/ACCESS.2022.3208603}, url = {https://ieeexplore.ieee.org/document/9899392/}, author = {Anselmi, Fabio and Tomaso Poggio} } @article {5256, title = {Robust Feature-Level Adversaries are Interpretability Tools}, year = {2022}, month = {10/2022}, address = {New Orleans, Louisiana}, abstract = {

The literature on adversarial attacks in computer vision typically focuses on pixel-level perturbations. These tend to be very difficult to interpret. Recent work that manipulates the latent representations of image generators to create "feature-level" adversarial perturbations gives us an opportunity to explore perceptible, interpretable adversarial attacks. We make three contributions. First, we observe that feature-level attacks provide useful classes of inputs for studying representations in models. Second, we show that these adversaries are uniquely versatile and highly robust. We demonstrate that they can be used to produce targeted, universal, disguised, physically-realizable, and black-box attacks at the ImageNet scale. Third, we show how these adversarial images can be used as a practical interpretability tool for identifying bugs in networks. We use these adversaries to make predictions about spurious associations between features and classes which we then test by designing "copy/paste" attacks in which one natural image is pasted into another to cause a targeted misclassification. Our results suggest that feature-level attacks are a promising approach for rigorous interpretability research. They support the design of tools to better understand what a model has learned and diagnose brittle feature associations. Code is available at https://github.com/thestephencasper/feature_level_adv.

}, keywords = {Adversarial Attacks, Explainability, Interpretability}, url = {https://openreview.net/forum?id=lQ--doSB2o}, author = {Stephen Casper and Max Nadeau and Dylan Hadfield-Menell and Gabriel Kreiman} } @article {5300, title = {Scalable Causal Discovery with Score Matching}, year = {2022}, month = {09/2022}, abstract = {

This paper demonstrates how to discover the whole causal graph from the second derivative of the log-likelihood in observational non-linear additive Gaussian noise models. Leveraging scalable machine learning approaches to approximate the score function, we extend the work of Rolland et al., 2022, which only recovers the topological order from the score and requires an expensive pruning step to discover the edges. Our analysis leads to DAS, a practical algorithm that reduces the complexity of the pruning by a factor proportional to the graph size. In practice, DAS achieves accuracy competitive with the current state-of-the-art while being over an order of magnitude faster. Overall, our approach enables principled and scalable causal discovery, significantly lowering the compute bar.
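For orientation, the leaf-identification step that this line of work builds on (following Rolland et al., 2022) can be sketched as follows: in a nonlinear additive Gaussian noise model, a variable is a leaf exactly when the corresponding diagonal entry of the score's Jacobian is constant across samples, so leaves are peeled off by minimizing that variance. `score_jacobian_diag` is a placeholder for a learned score model, and this sketch omits the pruning step that DAS accelerates.

import numpy as np

def topological_order(X, score_jacobian_diag):
    """X: (n_samples, d) data; score_jacobian_diag(X_sub, cols) -> (n_samples, len(cols))
    diagonal entries of the Jacobian of the estimated score on the remaining variables."""
    remaining = list(range(X.shape[1]))
    leaves = []
    while remaining:
        diag = score_jacobian_diag(X[:, remaining], remaining)
        leaf = remaining[int(np.argmin(diag.var(axis=0)))]   # most constant diagonal entry
        leaves.append(leaf)
        remaining.remove(leaf)
    return leaves[::-1]   # reverse: sources first, leaves last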

}, url = {https://openreview.net/forum?id=v56PHv_W2A}, author = {Francesco Montagna and Nicoletta Noceti and Lorenzo Rosasco and Kun Zhang and Francesco Locatello} } @article {5082, title = {SGD Noise and Implicit Low-Rank Bias in Deep Neural Networks}, year = {2022}, month = {03/2022}, abstract = {

We analyze deep ReLU neural networks trained with mini-batch stochastic gradient descent and weight decay. We prove that the source of the SGD noise is an implicit low-rank constraint across all of the weight matrices within the network. Furthermore, we show, both theoretically and empirically, that when training a neural network using Stochastic Gradient Descent (SGD) with a small batch size, the resulting weight matrices are expected to be of small rank. Our analysis relies on a minimal set of assumptions, and the neural networks may include convolutional layers, residual connections, as well as batch normalization layers.
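A minimal sketch (with illustrative hyperparameters and synthetic data, not the paper's experiments) of how one might probe this low-rank bias empirically: train a small ReLU network with small-batch SGD and weight decay, then inspect the singular-value spectrum of each weight matrix via an effective-rank measure.

import torch
import torch.nn as nn

def effective_rank(W: torch.Tensor, tol: float = 0.01) -> int:
    """Count singular values above a fraction `tol` of the largest singular value."""
    s = torch.linalg.svdvals(W)
    return int((s > tol * s[0]).sum())

model = nn.Sequential(nn.Linear(100, 500), nn.ReLU(),
                      nn.Linear(500, 500), nn.ReLU(),
                      nn.Linear(500, 10))
opt = torch.optim.SGD(model.parameters(), lr=0.05, weight_decay=1e-3)
X, y = torch.randn(2048, 100), torch.randint(0, 10, (2048,))
for step in range(5000):
    idx = torch.randint(0, len(X), (8,))          # small batch size
    loss = nn.functional.cross_entropy(model(X[idx]), y[idx])
    opt.zero_grad()
    loss.backward()
    opt.step()

ranks = [effective_rank(m.weight.detach()) for m in model if isinstance(m, nn.Linear)]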

}, author = {Tomer Galanti and Tomaso Poggio} } @conference {5301, title = {Spontaneous sign emergence in humans and machines through an embodied communication game}, booktitle = {JCoLE Workshop}, year = {2022}, author = {Emily Cheng and Yen-Ling Kuo and Ignacio Cases and Boris Katz and Andrei Barbu} } @article {5143, title = {Stochastic consolidation of lifelong memory}, journal = {Scientific Reports}, volume = {12}, year = {2022}, month = {07/2022}, abstract = {

Humans have the remarkable ability to continually store new memories, while maintaining old memories for a lifetime. How the brain avoids catastrophic forgetting of memories due to interference between encoded memories is an open problem in computational neuroscience. Here we present a model for continual learning in a recurrent neural network combining Hebbian learning, synaptic decay and a novel memory consolidation mechanism: memories undergo stochastic rehearsals with rates proportional to the memory{\textquoteright}s basin of attraction, causing self-amplified consolidation. This mechanism gives rise to memory lifetimes that extend much longer than the synaptic decay time, and retrieval probability of memories that gracefully decays with their age. The number of retrievable memories is proportional to a power of the number of neurons. Perturbations to the circuit model cause temporally-graded retrograde and anterograde deficits, mimicking observed memory impairments following neurological trauma.

}, doi = {10.1038/s41598-022-16407-9}, url = {https://www.nature.com/articles/s41598-022-16407-9}, author = {Shaham, Nimrod and Chandra, Jay and Gabriel Kreiman and Sompolinsky, Haim} } @article {5224, title = {Synthesizing theories of human language with Bayesian program induction}, journal = {Nature Communications}, volume = {13}, year = {2022}, month = {08/2022}, abstract = {

Automated, data-driven construction and evaluation of scientific models and theories is a long-standing challenge in artificial intelligence. We present a framework for algorithmically synthesizing models of a basic part of human language: morpho-phonology, the system that builds word forms from sounds. We integrate Bayesian inference with program synthesis and representations inspired by linguistic theory and cognitive models of learning and discovery. Across 70 datasets from 58 diverse languages, our system synthesizes human-interpretable models for core aspects of each language{\textquoteright}s morpho-phonology, sometimes approaching models posited by human linguists. Joint inference across all 70 data sets automatically synthesizes a meta-model encoding interpretable cross-language typological tendencies. Finally, the same algorithm captures few-shot learning dynamics, acquiring new morphophonological rules from just one or a few examples. These results suggest routes to more powerful machine-enabled discovery of interpretable models in linguistics and other scientific domains.

}, doi = {10.1038/s41467-022-32012-w}, url = {https://www.nature.com/articles/s41467-022-32012-w}, author = {Ellis, Kevin and Albright, Adam and Solar-Lezama, Armando and Tenenbaum, Joshua B. and O{\textquoteright}Donnell, Timothy J.} } @article {5133, title = {System identification of neural systems: If we got it right, would we know?}, year = {2022}, month = {07/2022}, abstract = {

Various artificial neural networks developed by engineers have been evaluated as models of the brain, such as the ventral stream in the primate visual cortex. After being trained on large datasets, the network outputs are compared to recordings of biological neurons. Good performance in reproducing neural responses is taken as validation for the model. This system identification approach is different from the traditional ways to test theories and associated models in the natural sciences. Furthermore, it lacks a clear foundation in terms of theory and empirical validation. Here we begin characterizing some of these emerging approaches: what do they tell us? To address this question, we benchmark their ability to correctly identify a model by replacing the brain recordings with recordings from a known ground truth model. We evaluate commonly used identification techniques such as neural regression (linear regression on a population of model units) and centered kernel alignment (CKA). Even in the setting where the correct model is among the candidates, we find that the performance of these approaches at system identification is quite variable; it also depends significantly on factors independent of the ground truth architecture, such as scoring function and dataset.
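One of the identification scores evaluated here, linear centered kernel alignment (CKA), can be computed in a few lines of NumPy; the formula below is the standard one (Kornblith et al.-style), not code specific to this paper. X and Y are (n_stimuli, n_units) response matrices from the candidate model and the (ground-truth) target system.

import numpy as np

def linear_cka(X: np.ndarray, Y: np.ndarray) -> float:
    """Linear CKA between two centered response matrices."""
    X = X - X.mean(axis=0)
    Y = Y - Y.mean(axis=0)
    cross = np.linalg.norm(X.T @ Y, 'fro') ** 2
    return cross / (np.linalg.norm(X.T @ X, 'fro') * np.linalg.norm(Y.T @ Y, 'fro'))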

}, author = {Yena Han and Tomaso Poggio and Brian Cheung} } @article {5032, title = {Task-specific neural processes underlying conflict resolution during cognitive control}, journal = {BioRxiv}, year = {2022}, month = {01/2022}, abstract = {

Cognitive control involves flexibly combining multiple sensory inputs with task-dependent goals during decision making. Several tasks have been proposed to examine cognitive control, including Stroop, Eriksen-Flanker, and the Multi-source interference task. Because these tasks have been studied independently, it remains unclear whether the neural signatures of cognitive control reflect abstract control mechanisms or specific combinations of sensory and behavioral aspects of each task. To address this question, here we recorded invasive neurophysiological signals from 16 subjects and directly compared the three tasks against each other. Neural activity patterns in the theta and high-gamma frequency bands differed between incongruent and congruent conditions, revealing strong modulation by conflicting task demands. These neural signals were specific to each task, generalizing within a task but not across tasks. These results highlight the complex interplay between sensory inputs, motor outputs, and task demands and argue against a universal and abstract representation of conflict.

}, doi = {10.1101/2022.01.16.476535 }, url = {https://www.biorxiv.org/content/10.1101/2022.01.16.476535}, author = {Yuchen Xiao and Chien-Chen Chou and Garth Rees Cosgrove and Nathan E Crone and Scellig Stone and Joseph R Madsen and Ian Reucroft and Yen-Cheng Shih and Daniel Weisholtz and Hsiang-Yu Yu and William S. Anderson and Gabriel Kreiman} } @article {5011, title = {Three approaches to facilitate DNN generalization to objects in out-of-distribution orientations and illuminations}, number = {119}, year = {2022}, month = {01/2022}, abstract = {

The training data distribution is often biased towards objects in certain orientations and illumination conditions. While humans have a remarkable capability of recognizing objects in out-of-distribution (OoD) orientations and illuminations, Deep Neural Networks (DNNs) severely suffer in this case, even when large numbers of training examples are available. In this paper, we investigate three different approaches to improve DNNs in recognizing objects in OoD orientations and illuminations. Namely, these are (i) training much longer after convergence of the in-distribution (InD) validation accuracy, i.e., late-stopping, (ii) tuning the momentum parameter of the batch normalization layers, and (iii) enforcing invariance of the neural activity in an intermediate layer to orientation and illumination conditions. Each of these approaches substantially improves the DNN{\textquoteright}s OoD accuracy (more than 20\% in some cases). We report results in four datasets: two datasets are modified from the MNIST and iLab datasets, and the other two are novel (one of 3D rendered cars and another of objects taken from various controlled orientations and illumination conditions). These datasets allow us to study the effects of different amounts of bias and are challenging as DNNs perform poorly in OoD conditions. Finally, we demonstrate that even though the three approaches focus on different aspects of DNNs, they all tend to lead to the same underlying neural mechanism to enable OoD accuracy gains {\textendash} individual neurons in the intermediate layers become more selective to a category and also invariant to OoD orientations and illuminations. We anticipate this study to be a basis for further improvement of deep neural networks{\textquoteright} OoD generalization performance, which is crucial for achieving safe and fair AI applications.
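As an illustration of the second intervention (tuning batch-normalization momentum), the PyTorch helper below sets the momentum of every batch-normalization layer in a model; the value 0.01 is an illustrative choice, not necessarily the paper's setting.

import torch.nn as nn

def set_bn_momentum(model: nn.Module, momentum: float = 0.01) -> nn.Module:
    """Set the running-statistics momentum of all BatchNorm layers in `model`."""
    for m in model.modules():
        if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)):
            m.momentum = momentum
    return model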

}, author = {Akira Sakai and Taro Sunagawa and Spandan Madan and Kanata Suzuki and Takashi Katoh and Hiromichi Kobashi and Hanspeter Pfister and Pawan Sinha and Xavier Boix and Tomotake Sasaki} } @article {5304, title = {Toddlers{\textquoteright} social evaluations of agents who act on false beliefs}, journal = {Developmental Science}, volume = {26}, year = {2022}, month = {08/2022}, abstract = {

Mature social evaluations privilege agents{\textquoteright} intentions over the outcomes of their actions, but young children often privilege outcomes over intentions in verbal tasks probing their social evaluations. In three experiments (N =\ 118), we probed the development of intention-based social evaluation and mental state reasoning using nonverbal methods with 15-month-old toddlers. Toddlers viewed scenarios depicting a protagonist who sought to obtain one of two toys, each inside a different box, as two other agents observed. Then, the boxes{\textquoteright} contents were switched in the absence of the protagonist and either in the presence or the absence of the other agents. When the protagonist returned, one agent opened the box containing the protagonist{\textquoteright}s desired toy (a positive outcome), and the other opened the other box (a neutral outcome). When both agents had observed the toys move to their current locations, the toddlers preferred the agent who opened the box containing the desired toy. In contrast, when the agents had not seen the toys move and therefore should have expected the desired toy{\textquoteright}s location to be unchanged, the toddlers preferred the agent who opened the box that no longer contained the desired toy. Thus, the toddlers preferred the agent who intended to make the protagonist{\textquoteright}s desired toy accessible, even when its action, guided by a false belief concerning that toy{\textquoteright}s location, did not produce a positive outcome. Well before children connect beliefs to social behavior in verbal tasks, toddlers engage in intention-based evaluations of social agents with false beliefs.

}, issn = {1363-755X}, doi = {10.1111/desc.13314}, url = {https://onlinelibrary.wiley.com/toc/14677687/26/2}, author = {Woo, Brandon M. and Spelke, Elizabeth S.} } @article {5305, title = {Towards an objective characterization of an individual{\textquoteright}s facial movements using Self-Supervised Person-Specific-Models}, journal = {arXiv}, year = {2022}, month = {11/2022}, abstract = {

Disentangling facial movements from other facial characteristics, particularly from facial identity, remains a challenging task, as facial movements display great variation between individuals. In this paper, we aim to characterize individual-specific facial movements. We present a novel training approach to learn facial movements independently of other facial characteristics, focusing on each individual separately. We propose self-supervised Person-Specific Models (PSMs), in which one model per individual can learn to extract an embedding of the facial movements independently of the person{\textquoteright}s identity and other structural facial characteristics from unlabeled facial video. These models are trained using encoder-decoder-like architectures. We provide quantitative and qualitative evidence that a PSM learns a meaningful facial embedding that discovers fine-grained movements otherwise not characterized by a General Model (GM), which is trained across individuals and characterizes general patterns of facial movements. We present quantitative and qualitative evidence that this approach is easily scalable and generalizable for new individuals: facial movements knowledge learned on a person can quickly and effectively be transferred to a new person. Lastly, we propose a novel PSM using curriculum temporal learning to leverage the temporal contiguity between video frames. Our code, analysis details, and all pretrained models are available in Github and Supplementary Materials.

}, url = {https://arxiv.org/abs/2211.08279}, author = {Yanis Tazi and Michael Berger and Winrich A. Freiwald} } @conference {5306, title = {Trajectory Prediction with Linguistic Representations}, booktitle = {2022 IEEE International Conference on Robotics and Automation (ICRA)}, year = {2022}, address = {Philadelphia, PA, USA}, abstract = {

Language allows humans to build mental models that interpret what is happening around them resulting in more accurate long-term predictions. We present a novel trajectory prediction model that uses linguistic intermediate representations to forecast trajectories, and is trained using trajectory samples with partially-annotated captions. The model learns the meaning of each of the words without direct per-word supervision. At inference time, it generates a linguistic description of trajectories which captures maneuvers and interactions over an extended time interval. This generated description is used to refine predictions of the trajectories of multiple agents. We train and validate our model on the Argoverse dataset, and demonstrate improved accuracy results in trajectory prediction. In addition, our model is more interpretable: it presents part of its reasoning in plain language as captions, which can aid model development and can aid in building confidence in the model before deploying it.

}, doi = {10.1109/ICRA46639.2022.9811928}, url = {https://ieeexplore.ieee.org/document/9811928}, author = {Kuo, Yen-Ling and Huang, Xin and Barbu, Andrei and McGill, Stephen G. and Katz, Boris and Leonard, John J. and Rosman, Guy} } @article {5051, title = {Trajectory Prediction with Linguistic Representations}, year = {2022}, abstract = {

Language allows humans to build mental models that interpret what is happening around them resulting in more accurate long-term predictions. We present a novel trajectory prediction model that uses linguistic intermediate representations to forecast trajectories, and is trained using trajectory samples with partially-annotated captions. The model learns the meaning of each of the words without direct per-word supervision. At inference time, it generates a linguistic description of trajectories which captures maneuvers and interactions over an extended time interval. This generated description is used to refine predictions of the trajectories of multiple agents. We train and validate our model on the Argoverse dataset, and demonstrate improved accuracy results in trajectory prediction. In addition, our model is more interpretable: it presents part of its reasoning in plain language as captions, which can aid model development and can aid in building confidence in the model before deploying it.

}, author = {Yen-Ling Kuo and Xin Huang and Andrei Barbu and Stephen G. McGill and Boris Katz and John J. Leonard and Guy Rosman} } @article {5015, title = {Transformer Module Networks for Systematic Generalization in Visual Question Answering}, number = {121}, year = {2022}, month = {02/2022}, abstract = {Transformer-based models achieve great performance on Visual Question Answering (VQA). However, when we evaluate them on systematic generalization, i.e., handling novel combinations of known concepts, their performance degrades. Neural Module Networks (NMNs) are a promising approach for systematic generalization that consists of composing modules, i.e., neural networks that tackle a sub-task. Inspired by Transformers and NMNs, we propose Transformer Module Network (TMN), a novel Transformer-based model for VQA that dynamically composes modules into a question-specific Transformer network. TMNs achieve state-of-the-art systematic generalization performance in three VQA datasets, namely, CLEVR-CoGenT, CLOSURE and GQA-SGL, in some cases improving more than 30\% over standard Transformers. }, author = {Moyuru Yamada and Vanessa D{\textquoteright}Amario and Kentaro Takemoto and Xavier Boix and Tomotake Sasaki} } @article {5137, title = {Understanding the Role of Recurrent Connections in Assembly Calculus}, year = {2022}, month = {07/2022}, abstract = {

In this note, we explore the role of recurrent connections in Assembly Calculus through a number of experiments\ conducted on models with and without recurrent connections. We observe that assemblies can be formed even in the absence of\ recurrent connections, but also find that models with recurrent connections are more robust to noisy inputs. We also investigate\ the spectral structure of the synaptic weights and find intriguing similarities between models of neural assemblies and\ associative memories.
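A toy NumPy sketch of the Assembly Calculus "projection" operation referred to above: a stimulus area drives a target area through random synapses, the top-k cells fire (the cap), Hebbian plasticity strengthens used synapses, and a flag toggles the recurrent connections within the target area. All parameters are illustrative.

import numpy as np

def project(stimulus, n=1000, k=50, p=0.05, beta=0.1, rounds=20, recurrent=True, rng=None):
    """Form an assembly in a target area of n cells from a binary stimulus vector."""
    if rng is None:
        rng = np.random.default_rng(0)
    W_in = (rng.random((n, stimulus.size)) < p).astype(float)    # stimulus -> area synapses
    W_rec = (rng.random((n, n)) < p).astype(float)               # area -> area synapses
    prev = np.zeros(n)
    for _ in range(rounds):
        drive = W_in @ stimulus + (W_rec @ prev if recurrent else 0.0)
        winners = np.argsort(drive)[-k:]                          # k-winners-take-all cap
        fired = np.zeros(n)
        fired[winners] = 1.0
        W_in[winners] *= (1.0 + beta * stimulus)                  # Hebbian strengthening
        if recurrent:
            W_rec[np.ix_(winners, prev.nonzero()[0])] *= (1.0 + beta)
        prev = fired
    return prev.nonzero()[0]                                      # indices of the formed assembly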

}, author = {Akshay Rangamani and Yi Xie} } @article {5090, title = {Using child-friendly movie stimuli to study the development of face, place, and object regions from age 3 to 12 years}, journal = {Human Brain Mapping}, year = {2022}, month = {03/2022}, abstract = {

Scanning young children while they watch short, engaging, commercially-produced movies has emerged as a promising approach for increasing data retention and quality. Movie stimuli also evoke a richer variety of cognitive processes than traditional experiments, allowing the study of multiple aspects of brain development simultaneously. However, because these stimuli are uncontrolled, it is unclear how effectively distinct profiles of brain activity can be distinguished from the resulting data. Here we develop an approach for identifying multiple distinct subject-specific Regions of Interest (ssROIs) using fMRI data collected during movie-viewing. We focused on the test case of higher-level visual regions selective for faces, scenes, and objects. Adults (N = 13) were scanned while viewing a 5.6-min child-friendly movie, as well as a traditional localizer experiment with blocks of faces, scenes, and objects. We found that just 2.7 min of movie data could identify subject-specific face, scene, and object regions. While successful, movie-defined ssROIs still showed weaker domain selectivity than traditional ssROIs. Having validated our approach in adults, we then used the same methods on movie data collected from 3- to 12-year-old children (N = 122). Movie response timecourses in 3-year-old children{\textquoteright}s face, scene, and object regions were already significantly and specifically predicted by timecourses from the corresponding regions in adults. We also found evidence of continued developmental change, particularly in the face-selective posterior superior temporal sulcus. Taken together, our results reveal both early maturity and functional change in face, scene, and object regions, and more broadly highlight the promise of short, child-friendly movies for developmental cognitive neuroscience.

}, issn = {1065-9471}, doi = {10.1002/hbm.25815}, url = {https://onlinelibrary.wiley.com/doi/10.1002/hbm.25815}, author = {Kamps, Frederik S. and Richardson, Hilary and N. Apurva Ratan Murty and Nancy Kanwisher and Rebecca Saxe} } @article {5307, title = {Using machine learning to understand age and gender classification based on infant temperament}, journal = {PLOS ONE}, volume = {17}, year = {2022}, month = {04/2022}, pages = {e0266026}, abstract = {

Age and gender differences are prominent in the temperament literature, with the former particularly salient in infancy and the latter noted as early as the first year of life. This study represents a meta-analysis utilizing Infant Behavior Questionnaire-Revised (IBQ-R) data collected across multiple laboratories (N = 4438) to overcome limitations of smaller samples in elucidating links among temperament, age, and gender in early childhood. Algorithmic modeling techniques were leveraged to discern the extent to which the 14 IBQ-R subscale scores accurately classified participating children as boys (n = 2,298) and girls (n = 2,093), and into three age groups: youngest (\< 24 weeks; n = 1,102), mid-range (24 to 48 weeks; n = 2,557), and oldest (\> 48 weeks; n = 779). Additionally, simultaneous classification into age and gender categories was performed, providing an opportunity to consider the extent to which gender differences in temperament are informed by infant age. Results indicated that overall age group classification was more accurate than child gender models, suggesting that age-related changes are more salient than gender differences in early childhood with respect to temperament attributes. However, gender-based classification was superior in the oldest age group, suggesting that temperament differences between boys and girls are accentuated with development. Fear emerged as the subscale contributing to accurate classifications most notably overall. This study leads infancy research and meta-analytic investigations more broadly in a new direction as a methodological demonstration, and also provides optimal comparative data for the IBQ-R based on the largest and most representative dataset to date.
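As an illustration of the kind of classification analysis described above, the sketch below uses the 14 IBQ-R subscale scores as features and asks how well an off-the-shelf classifier recovers age group or gender; the specific algorithm, cross-validation split, and hyperparameters are assumptions for illustration, not necessarily the authors' choices.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def classification_accuracy(subscales: np.ndarray, labels: np.ndarray) -> float:
    """subscales: (n_children, 14) IBQ-R scores; labels: age-group or gender codes."""
    clf = RandomForestClassifier(n_estimators=500, random_state=0)
    return cross_val_score(clf, subscales, labels, cv=5).mean()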

}, doi = {10.1371/journal.pone.0266026}, url = {https://dx.plos.org/10.1371/journal.pone.0266026}, author = {Gartstein, Maria A. and Seamon, D. Erich and Mattera, Jennifer A. and Bosquet Enlow, Michelle and Wright, Rosalind J. and Perez-Edgar, Koraly and Buss, Kristin A. and LoBue, Vanessa and Bell, Martha Ann and Goodman, Sherryl H. and Spieker, Susan and Bridgett, David J. and Salisbury, Amy L. and Gunnar, Megan R. and Mliner, Shanna B. and Muzik, Maria and Stifter, Cynthia A. and Planalp, Elizabeth M. and Mehr, Samuel A. and Spelke, Elizabeth S. and Lukowski, Angela F. and Groh, Ashley M. and Lickenbrock, Diane M. and Santelli, Rebecca and Du Rocher Schudlich, Tina and Anzman-Frasca, Stephanie and Thrasher, Catherine and Diaz, Anjolii and Dayton, Carolyn and Moding, Kameron J. and Jordan, Evan M.}, editor = {Siuly, Siuly} } @article {5308, title = {Visual foundations of Euclidean geometry}, journal = {Cognitive Psychology}, volume = {136}, year = {2022}, month = {08/2022}, pages = {101494}, abstract = {

Geometry defines entities that can be physically realized in space, and our knowledge of abstract geometry may therefore stem from our representations of the physical world. Here, we focus on Euclidean geometry, the geometry historically regarded as {\textquotedblleft}natural{\textquotedblright}. We examine whether humans possess representations describing visual forms in the same way as Euclidean geometry {\textendash} i.e., in terms of their shape and size. One hundred and twelve participants from the U.S. (age 3{\textendash}34 years), and 25 participants from the Amazon (age 5{\textendash}67 years) were asked to locate geometric deviants in panels of 6 forms of variable orientation. Participants of all ages and from both cultures detected deviant forms defined in terms of shape or size, while only U.S. adults drew distinctions between mirror images (i.e. forms differing in {\textquotedblleft}sense{\textquotedblright}). Moreover, irrelevant variations of sense did not disrupt the detection of a shape or size deviant, while irrelevant variations of shape or size did. At all ages and in both cultures, participants thus retained the same properties as Euclidean geometry in their analysis of visual forms, even in the absence of formal instruction in geometry. These findings show that representations of planar visual forms provide core intuitions on which humans{\textquoteright} knowledge in Euclidean geometry could possibly be grounded.

}, issn = {00100285}, doi = {10.1016/j.cogpsych.2022.101494}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010028522000317}, author = {Izard, V{\'e}ronique and Pica, Pierre and Spelke, Elizabeth S.} } @article {5309, title = {Visual motion perception as online hierarchical inference}, journal = {Nature Communications}, volume = {13}, year = {2022}, month = {12/2022}, abstract = {

Identifying the structure of motion relations in the environment is critical for navigation, tracking, prediction, and pursuit. Yet, little is known about the mental and neural computations that allow the visual system to infer this structure online from a volatile stream of visual information. We propose online hierarchical Bayesian inference as a principled solution for how the brain might solve this complex perceptual task. We derive an online Expectation-Maximization algorithm that explains human percepts qualitatively and quantitatively for a diverse set of stimuli, covering classical psychophysics experiments, ambiguous motion scenes, and illusory motion displays. We thereby identify normative explanations for the origin of human motion structure perception and make testable predictions for future psychophysics experiments. The proposed online hierarchical inference model furthermore affords a neural network implementation which shares properties with motion-sensitive cortical areas and motivates targeted experiments to reveal the neural representations of latent structure.

}, doi = {10.1038/s41467-022-34805-5}, url = {https://www.nature.com/articles/s41467-022-34805-5}, author = {Bill, Johannes and Gershman, Samuel J. and Drugowitsch, Jan} } @book {5310, title = {What Babies Know: Core Knowledge}, year = {2022}, month = {08/2022}, pages = {190 - C5.T1}, publisher = {Oxford University Press, New York}, organization = {Oxford University Press, New York}, edition = {1}, abstract = {

Research on infants{\textquoteright} knowledge of objects, places, and number provides evidence for core cognitive systems that capture abstract, interconnected concepts and are early-emerging, present throughout life, innate, and supportive of learning. These systems also are ancient (they are shared by a wide range of other animals), sharply limited, unconscious, automatically activated, and dependent on our limited attentional resources. Here the author suggests that these properties collectively form a natural kind: a cognitive system with some of these properties will likely have all of them. Notably, a cognitive system shared by diverse, distantly related animals will gain a primordial blessing of abstraction: For example, the core place system will represent only the abstract geometric properties that apply to all the environments of the navigating animals that possess it, including terrestrial rats, flying birds, and aquatic fish. In the coming chapters, the author uses these arguments to propose three more systems of core knowledge.

}, isbn = {9780190618247}, doi = {10.1093/oso/9780190618247.003.0005}, url = {https://academic.oup.com/book/43912}, author = {Spelke, Elizabeth S.} } @article {5311, title = {What Could Go Wrong: Adults and Children Calibrate Predictions and Explanations of Others{\textquoteright} Actions Based on Relative Reward and Danger}, journal = {Cognitive Science}, volume = {46}, year = {2022}, month = {06/2022}, abstract = {

When human adults make decisions (e.g., wearing a seat belt), we often consider the negative consequences that would ensue if our actions were to fail, even if we have never experienced such a failure. Do the same considerations guide our understanding of other people{\textquoteright}s decisions? In this paper, we investigated whether adults, who have many years of experience making such decisions, and 6- and 7-year-old children, who have less experience and are demonstrably worse at judging the consequences of their own actions, conceive others{\textquoteright} actions as motivated both by reward (how good reaching one{\textquoteright}s intended goal would be), and by what we call {\textquotedblleft}danger{\textquotedblright} (how badly one{\textquoteright}s action could end). In two pre-registered experiments, we tested whether adults and 6- and 7-year-old children tailor their predictions and explanations of an agent{\textquoteright}s action choices to the specific degree of danger and reward entailed by each action. Across four different tasks, we found that children and adults expected others to negatively appraise dangerous situations and minimize the danger of their actions. Children{\textquoteright}s and adults{\textquoteright} judgments varied systematically in accord with both the degree of danger the agent faced and the value the agent placed on the goal state it aimed to achieve. However, children did not calibrate their inferences about how much an agent valued the goal state of a successful action in accord with the degree of danger the action entailed, and adults calibrated these inferences more weakly than inferences concerning the agent{\textquoteright}s future action choices. These results suggest that from childhood, people use a degree of danger and reward to make quantitative, fine-grained explanations and predictions about other people{\textquoteright}s behavior, consistent with computational models on theory of mind that contain continuous representations of other agents{\textquoteright} action plans.

}, issn = {0364-0213}, doi = {10.1111/cogs.13163}, url = {https://onlinelibrary.wiley.com/toc/15516709/46/7}, author = {Gjata, Nensi N. and Ullman, Tomer D. and Spelke, Elizabeth S. and Liu, Shari} } @article {5025, title = {When and how convolutional neural networks generalize to out-of-distribution category{\textendash}viewpoint combinations}, journal = {Nature Machine Intelligence}, volume = {4}, year = {2022}, month = {02/2022}, pages = {146 - 153}, abstract = {

Object recognition and viewpoint estimation lie at the heart of visual understanding. Recent studies have suggested that convolutional neural networks (CNNs) fail to generalize to out-of-distribution (OOD) category{\textendash}viewpoint combinations, that is, combinations not seen during training. Here we investigate when and how such OOD generalization may be possible by evaluating CNNs trained to classify both object category and three-dimensional viewpoint on OOD combinations, and identifying the neural mechanisms that facilitate such OOD generalization. We show that increasing the number of in-distribution combinations (data diversity) substantially improves generalization to OOD combinations, even with the same amount of training data. We compare learning category and viewpoint in separate and shared network architectures, and observe starkly different trends on in-distribution and OOD combinations, that is, while shared networks are helpful in distribution, separate networks significantly outperform shared ones at OOD combinations. Finally, we demonstrate that such OOD generalization is facilitated by the neural mechanism of specialization, that is, the emergence of two types of neuron{\textemdash}neurons selective to category and invariant to viewpoint, and vice versa.

}, doi = {10.1038/s42256-021-00437-5}, url = {https://www.nature.com/articles/s42256-021-00437-5}, author = {Madan, Spandan and Henry, Timothy and Dozier, Jamell and Ho, Helen and Bhandari, Nishchal and Sasaki, Tomotake and Durand, Fr{\'e}do and Pfister, Hanspeter and Boix, Xavier} } @article {5065, title = { AGENT: A Benchmark for Core Psychological Reasoning}, year = {2021}, month = {07/2021}, author = {Tianmin Shu and Abhishek Bhandwaldar and Chuang Gan and Kevin A Smith and Shari Liu and Dan Gutfreund and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman} } @article {5041, title = {Beauty is in the eye of the machine}, journal = {Nature Human Behaviour}, volume = {5}, year = {2021}, month = {05/2021}, pages = {675 - 676}, abstract = {

Ansel Adams said, {\textquotedblleft}There are no rules for good photographs, there are only good photographs.{\textquotedblright} Is it possible to predict our fickle and subjective appraisal of {\textquoteleft}aesthetically pleasing{\textquoteright} visual art? Iigaya et al. used an artificial intelligence approach to show how human aesthetic preference can be partially explained as an integration of hierarchical constituent image features.

Artificial intelligence (AI) has made rapid strides in a wide range of visual tasks, including recognition of objects and faces, automatic diagnosis of clinical images, and answering questions about images. More recently, AI has also started penetrating the arts. For example, in October 2018, the first piece of AI-generated art came to auction, with an initial estimate of US$ 10,000, and strikingly garnered a final bid of US$ 432,500 (Fig. 1). The portrait depicts a portly gentleman with a seemingly fuzzy facial expression, dressed in a black frockcoat with a white collar. Appreciating and creating a piece of art requires a general understanding of aesthetics. What are the nuances, structures, and semantics embedded in a painting that give us a sense of aesthetic pleasure?

}, doi = {10.1038/s41562-021-01125-5}, url = {http://www.nature.com/articles/s41562-021-01125-5}, author = {Zhang, Mengmi and Gabriel Kreiman} } @book {4790, title = {Biological and Computer Vision}, year = {2021}, month = {02/2021}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {Cambridge, UK}, abstract = {

Imagine a world where machines can see and understand the world the way humans do. Rapid progress in artificial intelligence has led to smartphones that recognize faces, cars that detect pedestrians, and algorithms that suggest diagnoses from clinical images, among many other applications. The success of computer vision is founded on a deep understanding of the neural circuits in the brain responsible for visual processing. This book introduces the neuroscientific study of neuronal computations in visual cortex alongside the cognitive psychological understanding of visual cognition and the burgeoning field of biologically-inspired artificial intelligence. Topics include the neurophysiological investigation of visual cortex, visual illusions, visual disorders, deep convolutional neural networks, machine learning, and generative adversarial networks, among others. It is an ideal resource for students and researchers looking to build bridges across different approaches to studying and building visual systems.

}, isbn = {978-1108705004}, issn = {1108705006}, doi = {10.1017/9781108649995}, url = {https://www.cambridge.org/core/books/biological-and-computer-vision/BB7E68A69AFE7A322F68F3C4A297F3CF}, author = {Gabriel Kreiman} } @article {4823, title = {Causal inference in environmental sound recognition}, journal = {Cognition}, year = {2021}, month = {03/2021}, abstract = {

Sound is caused by physical events in the world. Do humans infer these causes when recognizing sound sources? We tested whether the recognition of common environmental sounds depends on the inference of a basic physical variable -- the source intensity (i.e. the power that produces a sound). A source{\textquoteright}s intensity can be inferred from the intensity it produces at the ear and its distance, which is normally conveyed by reverberation. Listeners could thus use intensity at the ear and reverberation to constrain recognition by inferring the underlying source intensity. Alternatively, listeners might separate these acoustic cues from their representation of a sound{\textquoteright}s identity in the interest of invariant recognition. We compared these two hypotheses by measuring recognition accuracy for sounds with typically low or high source intensity (e.g. pepper grinders vs. trucks) that were presented across a range of intensities at the ear or with reverberation cues to distance. The recognition of low-intensity sources (e.g. pepper grinders) was impaired by high presentation intensities or reverberation that conveyed distance, either of which imply high source intensity. Neither effect occurred for high-intensity sources. The results suggest that listeners implicitly use the intensity at the ear along with distance cues to infer a source{\textquoteright}s power and constrain its identity. The recognition of real-world sounds thus appears to depend upon the inference of their physical generative parameters, even generative parameters whose cues might otherwise be separated from the representation of a sound{\textquoteright}s identity.

}, doi = {10.1016/j.cognition.2021.104627}, author = {James Traer and Sam Norman-Haignere and Josh H. McDermott} } @article {4794, title = {Characterizing a snapshot of perceptual experience.}, journal = {Journal of Experimental Psychology: General}, year = {2021}, month = {02/2021}, abstract = {

What can we perceive in a single glance of the visual world? Although this question appears rather simple, answering it has been remarkably difficult and controversial. Traditionally, researchers have tried to infer the nature of perceptual experience by examining how many objects and what types of objects are not fully encoded within a scene (e.g., failing to notice a bowl disappearing/changing). Here, we took a different approach and asked how much we could alter an entire scene before observers noticed those global alterations. Surprisingly, we found that observers could fixate on a scene for hundreds of milliseconds yet routinely fail to notice drastic changes to that scene (e.g., scrambling the periphery so no object can be identified, putting the center of one scene on the background of another scene). We also found that as observers allocate more attention to their periphery, their ability to notice these changes to a scene increases. Together, these results show that although a single snapshot of perceptual experience can be remarkably impoverished, it is also not a fixed constant and is likely to be continuously changing from moment to moment depending on attention.

}, issn = {0096-3445}, doi = {10.1037/xge0000864}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/xge0000864}, author = {Cohen, Michael A. and Ostrand, Caroline and Frontero, Nicole and Pham, Phuong-Nghi} } @article {5066, title = { Chemogenetic suppression of macaque V4 neurons produces retinotopically specific deficits in downstream IT neural activity patterns and core object recognition behavior}, journal = {Journal of Vision}, volume = {21}, year = {2021}, month = {09/2021}, chapter = {2489}, abstract = {

Distributed activity patterns across multiple brain areas (e.g., V4, IT) enable primates to accurately identify visual objects. To strengthen our inferences about the causal role of underlying brain circuits, it is necessary to develop targeted neural perturbation strategies that enable discrimination amongst competing models. To probe the role of area V4 in core object recognition, we expressed inhibitory DREADDs in neurons within a 5x5 mm subregion of V4 cortex via multiple viral injections (AAV8-hSyn-hM4Di-mCherry; two macaques). To assay for successful neural suppression, we recorded from a multi-electrode array implanted over the transfected V4. We also recorded from multi-electrode arrays in the IT cortex (the primary feedforward target of V4), while simultaneously measuring the monkeys{\textquoteright} behavior during object discrimination tasks. We found that systemic (intramuscular) injection of the DREADDs activator (CNO) produced reversible reductions (~20\%) in image-evoked V4 responses compared to the control condition (saline injections). Monkeys showed significant behavioral performance deficits upon CNO injections (compared to saline), which were larger when the object position overlapped with the RF estimates of the transfected V4 neurons. This is consistent with the hypothesis that the suppressed V4 neurons are critical to this behavior. Furthermore, we observed commensurate deficits in the linearly-decoded estimates of object identity from the IT population activity (post-CNO). To model the perturbed brain circuitry, we used a primate brain-mapped artificial neural network (ANN) model (CORnet-S) that supports object recognition. We {\textquotedblleft}lesioned{\textquotedblright} the model{\textquoteright}s corresponding V4 subregion by modifying its weights such that the responses matched a subset of our experimental V4 measurements (post-CNO). Indeed, the lesioned model better predicted the measured (held-out) V4 and IT responses (post-CNO), compared to the model{\textquoteright}s non-lesioned version, validating our approach. In the future, our approach allows us to discriminate amongst competing mechanistic brain models, while the data provides constraints to guide more accurate alternatives.
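As an illustration of the kind of linearly decoded population estimate of object identity mentioned above, here is a schematic cross-validated linear decode; the array shapes and random placeholder data are hypothetical, and this is not the analysis code used in the study.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
rates = rng.normal(size=(400, 96))        # trials x IT sites (placeholder firing rates)
labels = rng.integers(0, 8, size=400)     # object identity per trial (placeholder)

# cross-validated linear readout of object identity from the population response vector
decoder = LogisticRegression(max_iter=2000)
print(cross_val_score(decoder, rates, labels, cv=5).mean())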

}, doi = {10.1167/jov.21.9.2489}, url = {https://jov.arvojournals.org/article.aspx?articleid=2777218}, author = {Kohitij Kar and Martin Schrimpf and Kailyn Schmidt and James J. DiCarlo} } @article {5067, title = {Cognitive boundary signals in the human medial temporal lobe shape episodic memory representation}, journal = {bioRxiv}, year = {2021}, month = {01/2021}, abstract = {

While experience unfolds continuously, memories are organized as a set of discrete events that bind together the {\textquotedblleft}where{\textquotedblright}, {\textquotedblleft}when{\textquotedblright}, and {\textquotedblleft}what{\textquotedblright} of episodic memory. This segmentation of continuous experience is thought to be facilitated by the detection of salient environmental or cognitive events. However, the underlying neural mechanisms and how such segmentation shapes episodic memory representations remain unclear. We recorded from single neurons in the human medial temporal lobe while subjects watched videos with different types of embedded boundaries and were subsequently evaluated for memories of the video contents. Here we show neurons that signal the presence of cognitive boundaries between subevents from the same episode and neurons that detect the abstract separation between different episodes. The firing rate and spike timing of these boundary-responsive neurons were predictive of later memory retrieval accuracy. At the population level, abrupt neural state changes following boundaries predicted enhanced memory strength but impaired order memory, capturing the behavioral tradeoff subjects exhibited when recalling episodic content versus temporal order. Successful retrieval was associated with reinstatement of the neural state present following boundaries, indicating that boundaries structure memory search. These findings reveal a neuronal substrate for detecting cognitive boundaries and show that cognitive boundary signals facilitate the mnemonic organization of continuous experience as a set of discrete episodic events.

}, author = {Jie Zheng and Andrea G{\'o}mez Palacio Schjetnan and Mar Yebra and Clayton Mosher and Suneil Kalia and Taufik A. Valiante and Adam N. Mamelak and Gabriel Kreiman and Ueli Rutishauser} } @conference {5068, title = {Combining Different V1 Brain Model Variants to Improve Robustness to Image Corruptions in CNNs}, booktitle = {NeurIPS 2021}, year = {2021}, month = {12/2021}, abstract = {

While some convolutional neural networks (CNNs) have surpassed human visual abilities in object classification, they often struggle to recognize objects in images corrupted with different types of common noise patterns, highlighting a major limitation of this family of models. Recently, it has been shown that simulating a primary visual cortex (V1) at the front of CNNs leads to small improvements in robustness to these image perturbations. In this study, we start with the observation that different variants of the V1 model show gains for specific corruption types. We then build a new model using an ensembling technique, which combines multiple individual models with different V1 front-end variants. The model ensemble leverages the strengths of each individual model, leading to significant improvements in robustness across all corruption categories and outperforming the base model by 38\% on average. Finally, we show that using distillation it is possible to partially compress the knowledge in the ensemble model into a single model with a V1 front-end. While the ensembling and distillation techniques used here are hardly biologically-plausible, the results presented here demonstrate that by combining the specific strengths of different neuronal circuits in V1 it is possible to improve the robustness of CNNs for a wide range of perturbations.
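A minimal sketch of the ensembling step described above, averaging class probabilities across models with different V1 front-end variants; model construction and loading are omitted, and ensemble_predict is a hypothetical helper rather than code from the paper.

import torch

def ensemble_predict(models, images):
    # average softmax outputs across the V1-variant models and take the most probable class
    probs = torch.stack([torch.softmax(m(images), dim=-1) for m in models])
    return probs.mean(dim=0).argmax(dim=-1)

# usage sketch: predictions = ensemble_predict([model_a, model_b, model_c], batch_of_images)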

}, url = {https://nips.cc/Conferences/2021/ScheduleMultitrack?event=41268}, author = {Avinash Baidya and Joel Dapello and James J. DiCarlo and Tiago Marques} } @proceedings {4928, title = {Competition from novel features drives scalar inferences in reference games}, volume = {43}, year = {2021}, month = {07/2021}, abstract = {

Scalar implicatures, one of the signatures of pragmatic reasoning, are believed to arise from competing alternative utterances, which the listener knows that the speaker could have used to express a strengthened meaning. But do scalar implicatures also arise in the presence of nonce objects, for which no alternative name is known? We conduct a series of experiments assessing the degree of scalar strengthening driven by familiar and nonce objects. We find that nonce objects can derive scalar implicatures as strongly as familiar objects in simple reference games. Our experiments also reveal an asymmetry in the relative strengths of familiar- and nonce-driven inferences: relative to the prior, participants preferentially interpret the name of a shared feature as referring to an object with an additional nonce feature over an object with an additional familiar feature, suggesting that familiar alternatives exert greater scalar pressure than nonce alternatives. We also present exploratory model simulations suggesting that our results may be explained by rationally reasoning about a high-cost description of the novel object. Our findings support the idea that novel lexical entries may be generated from one-shot encounters and spontaneously used in pragmatic inference.

}, url = {https://escholarship.org/uc/item/8jx5h8sn}, author = {Hu, Jennifer and Zaslavsky, Noga and Levy, Roger} } @article {5054, title = {Compositional Networks Enable Systematic Generalization for Grounded Language Understanding}, year = {2021}, abstract = {

Humans are remarkably flexible when understanding new sentences that include combinations of concepts they have never encountered before. Recent work has shown that while deep networks can mimic some human language abilities when presented with novel sentences, systematic variation uncovers the limitations in the language-understanding abilities of networks. We demonstrate that these limitations can be overcome by addressing the generalization challenges in the gSCAN dataset, which explicitly measures how well an agent is able to interpret novel linguistic commands grounded in vision, e.g., novel pairings of adjectives and nouns. The key principle we employ is compositionality: that the compositional structure of networks should reflect the compositional structure of the problem domain they address, while allowing other parameters to be learned end-to-end. We build a general-purpose mechanism that enables agents to generalize their language understanding to compositional domains. Crucially, our network has the same state-of-the-art performance as prior work while generalizing its knowledge when prior work does not. Our network also provides a level of interpretability that enables users to inspect what each part of the network learns. Robust grounded language understanding without dramatic failures and without corner cases is critical to building safe and fair robots; we demonstrate the significant role that compositionality can play in achieving that goal.

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5069, title = {Compositional RL Agents That Follow Language Commands in Temporal Logic}, journal = {Frontiers in Robotics and AI}, volume = {8}, year = {2021}, month = {07/2022}, abstract = {

We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.

}, doi = {10.3389/frobt.2021.689550}, url = {https://www.frontiersin.org/articles/10.3389/frobt.2021.689550/full}, author = {Kuo, Yen-Ling and Katz, Boris and Andrei Barbu} } @article {5056, title = {Compositional RL Agents That Follow Language Commands in Temporal Logic}, year = {2021}, abstract = {

We demonstrate how a reinforcement learning agent can use compositional recurrent neural networks to learn to carry out commands specified in linear temporal logic (LTL). Our approach takes as input an LTL formula, structures a deep network according to the parse of the formula, and determines satisfying actions. This compositional structure of the network enables zero-shot generalization to significantly more complex unseen formulas. We demonstrate this ability in multiple problem domains with both discrete and continuous state-action spaces. In a symbolic domain, the agent finds a sequence of letters that satisfy a specification. In a Minecraft-like environment, the agent finds a sequence of actions that conform to a formula. In the Fetch environment, the robot finds a sequence of arm configurations that move blocks on a table to fulfill the commands. While most prior work can learn to execute one formula reliably, we develop a novel form of multi-task learning for RL agents that allows them to learn from a diverse set of tasks and generalize to a new set of diverse tasks without any additional training. The compositional structures presented here are not specific to LTL, thus opening the path to RL agents that perform zero-shot generalization in other compositional domains.

}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @article {5070, title = {Computational models of category-selective brain regions enable high-throughput tests of selectivity}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {12/2021}, abstract = {

Cortical regions apparently selective to faces, places, and bodies have provided important evidence for domain-specific theories of human cognition, development, and evolution. But claims of category selectivity are not quantitatively precise and remain vulnerable to empirical refutation. Here we develop artificial neural network-based encoding models that accurately predict the response to novel images in the fusiform face area, parahippocampal place area, and extrastriate body area, outperforming descriptive models and experts. We use these models to subject claims of category selectivity to strong\ tests, by screening for and synthesizing images predicted to produce high responses. We find that these high-response-predicted images are all unambiguous members of the hypothesized preferred category for each region. These results provide accurate, image-computable encoding models of each category-selective region, strengthen evidence for domain specificity in the brain, and point the way for future research characterizing the functional organization of the brain with unprecedented computational precision.
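A schematic version of the encoding-model fit described above: ridge regression from ANN image features to the measured response of a region, scored on held-out images. The arrays are random placeholders and the feature dimensionality is arbitrary; this is not the code used in the study.

import numpy as np
from sklearn.linear_model import RidgeCV

rng = np.random.default_rng(0)
features = rng.normal(size=(500, 2048))   # images x ANN features (placeholder)
response = rng.normal(size=500)           # per-image response of one region (placeholder)

model = RidgeCV(alphas=np.logspace(-2, 5, 15)).fit(features[:400], response[:400])
print(model.score(features[400:], response[400:]))   # prediction accuracy on held-out images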

}, doi = {10.1038/s41467-021-25409-6}, url = {https://www.nature.com/articles/s41467-021-25409-6}, author = {N. Apurva Ratan Murty and Pouya Bashivan and Abate, Alex and James J. DiCarlo and Nancy Kanwisher} } @article {5071, title = {Confidence and central tendency in perceptual judgment}, journal = {Attention, Perception, \& Psychophysics}, volume = {83}, year = {2021}, month = {10/2021}, pages = {3024 - 3034}, abstract = {

This paper theoretically and empirically investigates the role of noisy cognition in perceptual judgment, focusing on the central tendency effect: the well-known empirical regularity that perceptual judgments are biased towards the center of the stimulus distribution. Based on a formal Bayesian framework, we generate predictions about the relationships between subjective confidence, central tendency, and response variability. Specifically, our model clarifies that lower subjective confidence as a measure of posterior uncertainty about a judgment should predict (i) a lower sensitivity of magnitude estimates to objective stimuli; (ii) a higher sensitivity to the mean of the stimulus distribution; (iii) a stronger central tendency effect at higher stimulus magnitudes; and (iv) higher response variability. To test these predictions, we collect a large-scale experimental data set and additionally re-analyze perceptual judgment data from several previous experiments. Across data sets, subjective confidence is strongly predictive of the central tendency effect and response variability, both correlationally and when we exogenously manipulate the magnitude of sensory noise. Our results are consistent with (but not necessarily uniquely explained by) Bayesian models of confidence and the central tendency.
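A standard Gaussian sketch consistent with the predictions above (offered for concreteness; not necessarily the exact formal model used in the paper): given a noisy measurement $x$ of a stimulus $s$ with noise variance $\sigma^2$ and a prior $s \sim \mathcal{N}(\mu, \sigma_p^2)$ over the stimulus distribution, the posterior-mean estimate is

\[ \hat{s} = w\,x + (1-w)\,\mu , \qquad w = \frac{\sigma_p^2}{\sigma_p^2 + \sigma^2} , \]

so larger sensory noise (and hence lower confidence) lowers $w$, reducing sensitivity to the objective stimulus, increasing sensitivity to the mean of the stimulus distribution, and inflating response variability.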

}, issn = {1943-3921}, doi = {10.3758/s13414-021-02300-6}, url = {https://link.springer.com/10.3758/s13414-021-02300-6}, author = {Xiang, Yang and Graeber, Thomas and Enke, Benjamin and Samuel J Gershman} } @article {4780, title = {Deep Learning for Seismic Inverse Problems: Toward the Acceleration of Geophysical Analysis Workflows}, journal = {IEEE Signal Processing Magazine}, volume = {38}, year = {2021}, month = {03/2021}, pages = {89 - 119}, abstract = {

Seismic inversion is a fundamental tool in geophysical analysis, providing a window into Earth. In particular, it enables the reconstruction of large-scale subsurface Earth models for hydrocarbon exploration, mining, earthquake analysis, shallow hazard assessment, and other geophysical tasks.

}, issn = {1053-5888}, doi = {10.1109/MSP.2020.3037429}, url = {https://ieeexplore.ieee.org/abstract/document/9363496}, author = {Amir Adler and Araya-Polo, Mauricio and Tomaso Poggio} } @article {4990, title = {Deep neural network models reveal interplay of peripheral coding and stimulus statistics in pitch perception}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {12/2021}, abstract = {

Perception is thought to be shaped by the environments for which organisms are optimized. These influences are difficult to test in biological organisms but may be revealed by machine perceptual systems optimized under different conditions. We investigated environmental and physiological influences on pitch perception, whose properties are commonly linked to peripheral neural coding limits. We first trained artificial neural networks to estimate fundamental frequency from biologically faithful cochlear representations of natural sounds. The best-performing networks replicated many characteristics of human pitch judgments. To probe the origins of these characteristics, we then optimized networks given altered cochleae or sound statistics. Human-like behavior emerged only when cochleae had high temporal fidelity and when models were optimized for naturalistic sounds. The results suggest pitch perception is critically shaped by the constraints of natural environments in addition to those of the cochlea, illustrating the use of artificial neural networks to reveal underpinnings of behavior.

}, doi = {10.1038/s41467-021-27366-6}, url = {https://www.nature.com/articles/s41467-021-27366-6}, author = {Saddler, Mark R. and Gonzalez, Ray and Josh H. McDermott} } @article {4752, title = {Distribution of Classification Margins: Are All Data Equal?}, year = {2021}, abstract = {

Recent theoretical results show that gradient descent on deep neural networks under exponential loss functions locally maximizes classification margin, which is equivalent to minimizing the norm of the weight matrices under margin\  constraints. This property of the solution however does not fully characterize the generalization performance. We motivate theoretically and show empirically that the area under the curve of the margin distribution on the training set is in fact a good measure of generalization. We then show that, after data separation is achieved, it is possible to dynamically reduce the training set by more than 99\% without significant loss of performance. Interestingly, the resulting subset of {\textquotedblleft}high capacity{\textquotedblright} features is not consistent across different training runs, which is consistent with the theoretical claim that all training points should converge to the same asymptotic margin under SGD and in the presence of both batch normalization and weight decay.
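An illustrative computation of per-example classification margins and a simple summary of their distribution; the logits and labels are random placeholders, and the exact normalization of the area under the margin-distribution curve in the paper may differ.

import numpy as np

rng = np.random.default_rng(0)
logits = rng.normal(size=(1000, 10))              # examples x classes (placeholder)
labels = rng.integers(0, 10, size=1000)

true_scores = logits[np.arange(len(labels)), labels]
rest = logits.copy()
rest[np.arange(len(labels)), labels] = -np.inf
margins = true_scores - rest.max(axis=1)          # margin of the true class over the best competitor

curve = np.sort(margins)                          # empirical margin distribution
area = curve.mean()                               # equals the normalized area under the sorted-margin curve
print(area)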

}, author = {Andrzej Banburski and Fernanda De La Torre and Nishka Pant and Ishana Shastri and Tomaso Poggio} } @article {4880, title = {Dynamics and Neural Collapse in Deep Classifiers trained with the Square Loss}, year = {2021}, abstract = {

We overview several properties -- old and new -- of training overparametrized deep networks under the square loss. We first consider a model of the dynamics of gradient flow under the square loss in deep homogeneous ReLU networks. We study the convergence to a solution with the absolute minimum $\rho$, which is the product of the Frobenius norms of each layer weight matrix, when normalization by Lagrange multipliers (LM) is used together with Weight Decay (WD) under different forms of gradient descent. A main property of the minimizers that bounds their expected error {\it for a specific network architecture} is $\rho$. In particular, we derive novel norm-based bounds for convolutional layers that are orders of magnitude better than classical bounds for dense networks. Next we prove that quasi-interpolating solutions obtained by Stochastic Gradient Descent (SGD) in the presence of WD have a bias towards low-rank weight matrices -- which, as we also explain, should improve generalization. The same analysis predicts the existence of an inherent SGD noise for deep networks. In both cases, we verify our predictions experimentally. We then predict Neural Collapse and its properties without any specific assumption -- unlike other published proofs. Our analysis supports the idea that the advantage of deep networks relative to other classifiers is greater for the problems that are appropriate for sparse deep architectures such as CNNs. The deep reason is that compositionally sparse target functions can be approximated well by {\textquoteleft}{\textquoteleft}sparse{\textquoteright}{\textquoteright} deep networks without incurring the curse of dimensionality.
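For concreteness, the quantity referred to above can be written, for an $L$-layer network with weight matrices $W_1, \dots, W_L$, as

\[ \rho = \prod_{k=1}^{L} \| W_k \|_F , \]

the product of the per-layer Frobenius norms described in the abstract.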

}, author = {M. Xu and Akshay Rangamani and Andrzej Banburski and Q. Liao and Tomer Galanti and Tomaso Poggio} } @article {4769, title = {The Effects of Image Distribution and Task on Adversarial Robustness}, year = {2021}, month = {02/2021}, abstract = {

In this paper, we propose an adaptation to the area under the curve (AUC) metric to measure the adversarial robustness of a model over a particular ε-interval [ε0, ε1] (interval of adversarial perturbation strengths) that facilitates unbiased comparisons across models when they have different initial ε0 performance. This can be used to determine how adversarially robust a model is to different image distributions or tasks (or some other variable), and/or to measure how robust a model is compared to other models. We used this adversarial robustness metric on models of MNIST, CIFAR-10, and a Fusion dataset (CIFAR-10 + MNIST), where trained models performed either a digit or object recognition task using a LeNet, ResNet50, or fully connected network (FullyConnectedNet) architecture, and found the following: 1) CIFAR-10 models are inherently less adversarially robust than MNIST models; 2) both the image distribution and task that a model is trained on can affect the adversarial robustness of the resultant model; and 3) pretraining with a different image distribution and task sometimes carries over the adversarial robustness induced by that image distribution and task to the resultant model. Collectively, our results imply non-trivial differences in the learned representation space of one perceptual system over another given its exposure to different image statistics or tasks (mainly objects vs digits). Moreover, these results hold even when model systems are equalized to have the same level of performance, or when exposed to approximately matched image statistics of fusion images but with different tasks.
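One way to write such an interval-restricted robustness score is as the normalized area under the accuracy-versus-perturbation curve,

\[ \mathrm{AUC}_{[\epsilon_0, \epsilon_1]} = \frac{1}{\epsilon_1 - \epsilon_0} \int_{\epsilon_0}^{\epsilon_1} \mathrm{acc}(\epsilon)\, d\epsilon , \]

a sketch of the general idea; the exact normalization used in the paper to make comparisons unbiased across models with different initial performance may differ.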

}, author = {Owen Kunhardt and Arturo Deza and Tomaso Poggio} } @book {4822, title = {Encyclopedia of Color Science and Technology: Bayesian Approaches to Color Category Learning}, year = {2021}, month = {01/2021}, pages = {1 - 5}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, address = {Berlin, Heidelberg}, isbn = {978-3-642-27851-8}, doi = {10.1007/978-3-642-27851-8}, url = {http://link.springer.com/10.1007/978-3-642-27851-8}, author = {Griffiths, Thomas L. and Zaslavsky, Noga}, editor = {Shamey, Renzo} } @article {5072, title = {Evaluating the Adversarial Robustness of a Foveated Texture Transform Module in a CNN}, year = {2021}, month = {12/2021}, url = {https://nips.cc/Conferences/2021/Schedule?showEvent=21868}, author = {Jonathan Gant and Andrzej Banburski and Arturo Deza and Tomaso Poggio} } @article {4843, title = {Evolutionary and biomedical insights from a marmoset diploid genome assembly}, journal = {Nature}, year = {2021}, month = {04/2021}, abstract = {

The accurate and complete assembly of both haplotype sequences of a diploid organism is essential to understanding the role of variation in genome functions, phenotypes, and diseases1. Here, using a trio-binning approach, we present a high-quality, diploid reference genome, with both haplotypes assembled independently at the chromosome level, for the common marmoset (Callithrix jacchus), an important primate model system widely used in biomedical research2,3. The full heterozygosity spectrum between the two haplotypes involves 1.36\% of the genome, much higher than the 0.13\% indicated by the standard single nucleotide heterozygosity estimation alone. The de novo mutation rate is 0.43 {\texttimes} 10$^{-8}$ per site per generation, where the paternally inherited genome acquired twice as many mutations as the maternal. Our diploid assembly enabled us to discover a recent expansion of the sex differentiated region and unique evolutionary changes in the marmoset Y chromosome. Additionally, we identified many genes with signatures of positive selection that might have contributed to the evolution of Callithrix biological features. Brain-related genes were highly conserved between marmosets and humans, though several genes experienced lineage-specific copy number variations or diversifying selection, providing important implications for the application of marmosets as a model system.

}, issn = {0028-0836}, doi = {10.1038/s41586-021-03535-x}, url = {http://www.nature.com/articles/s41586-021-03535-x}, author = {Yang, Chentao and Zhou, Yang and Marcus, Stephanie and Formenti, Giulio and Bergeron, Lucie A. and Song, Zhenzhen and Bi, Xupeng and Bergman, Juraj and Rousselle, Marjolaine Marie C. and Zhou, Chengran and Zhou, Long and Deng, Yuan and Fang, Miaoquan and Xie, Duo and Zhu, Yuanzhen and Tan, Shangjin and Mountcastle, Jacquelyn and Haase, Bettina and Balacco, Jennifer and Wood, Jonathan and Chow, William and Rhie, Arang and Pippel, Martin and Fabiszak, Margaret M. and Koren, Sergey and Fedrigo, Olivier and W. A. Freiwald and Howe, Kerstin and Yang, Huanming and Phillippy, Adam M. and Schierup, Mikkel Heide and Jarvis, Erich D. and Zhang, Guojie} } @article {4869, title = {A fast link between face perception and memory in the temporal pole}, journal = {Science}, year = {2021}, month = {07/2021}, pages = {eabi6671}, abstract = {

The question of how the brain recognizes the faces of familiar individuals has been important throughout the history of neuroscience. Cells linking visual processing to person memory have been proposed, but not found. Here we report the discovery of such cells through recordings from an fMRI-identified area in the macaque temporal pole. These cells responded to faces when they were personally familiar. They responded non-linearly to step-wise changes in face visibility and detail, and holistically to face parts, reflecting key signatures of familiar face recognition. They discriminated between familiar identities, as fast as a general face identity area. The discovery of these cells establishes a new pathway for the fast recognition of familiar individuals.

}, issn = {0036-8075}, doi = {10.1126/science.abi6671}, url = {https://www.sciencemag.org/lookup/doi/10.1126/science.abi6671}, author = {Landi, Sofia M. and Viswanathan, Pooja and Serene, Stephen and W. A. Freiwald} } @article {5074, title = {Fast Recurrent Processing via Ventrolateral Prefrontal Cortex Is Needed by the Primate Ventral Stream for Robust Core Visual Object Recognition}, journal = {Neuron}, volume = {109}, year = {2021}, month = {01/2021}, pages = {164 - 176.e5}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core object recognition require additional time to develop for specific, {\textquotedblleft}late-solved{\textquotedblright} images. This suggests the necessity of recurrent processing in these computations. Which brain circuits are responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventrolateral prefrontal cortex (vlPFC) is a critical recurrent node in this system, here, we pharmacologically inactivated parts of vlPFC and simultaneously measured IT activity while monkeys performed object discrimination tasks. vlPFC inactivation deteriorated the quality of late-phase (\>150\ ms from image onset) IT population code and produced commensurate behavioral deficits for late-solved images. Finally, silencing vlPFC caused the monkeys{\textquoteright} IT activity and behavior to become more like those produced by feedforward-only ventral stream models. Together with prior work, these results implicate fast recurrent processing through vlPFC as critical to producing behaviorally sufficient object representations in IT.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.09.035}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627320307595}, author = {Kar, Kohitij and James J. DiCarlo} } @article {4789, title = {Flexible modulation of sequence generation in the entorhinal-hippocampal system}, journal = {Nature Neuroscience}, year = {2021}, month = {04/2021}, abstract = {

Exploration, consolidation and planning depend on the generation of sequential state representations. However, these algorithms require disparate forms of sampling dynamics for optimal performance. We theorize how the brain should adapt internally generated sequences for particular cognitive functions and propose a neural mechanism by which this may be accomplished within the entorhinal{\textendash}hippocampal circuit. Specifically, we demonstrate that the systematic modulation along the medial entorhinal cortex dorsoventral axis of grid population input into the hippocampus facilitates a flexible generative process that can interpolate between qualitatively distinct regimes of sequential hippocampal reactivations. By relating the emergent hippocampal activity patterns drawn from our model to empirical data, we explain and reconcile a diversity of recently observed, but apparently unrelated, phenomena such as generative cycling, diffusive hippocampal reactivations and jumping trajectory events.

}, doi = {10.1038/s41593-021-00831-7}, url = {https://www.nature.com/articles/s41593-021-00831-7}, author = {McNamee, D. and Stachenfeld, K. and Botvinick, M.M. and Samuel J Gershman} } @conference {5045, title = {Frivolous Units: Wider Networks Are Not Really That Wide}, booktitle = {AAAI 2021}, year = {2021}, month = {05/2021}, abstract = {

A remarkable characteristic of overparameterized deep neural networks (DNNs) is that their accuracy does not degrade when the network width is increased. Recent evidence suggests that developing compressible representations allows the complexity of large networks to be adjusted for the learning task at hand. However, these representations are poorly understood. A promising strand of research inspired by biology involves studying representations at the unit level as it offers a more granular interpretation of the neural mechanisms. In order to better understand what facilitates increases in width without decreases in accuracy, we ask: Are there mechanisms at the unit level by which networks control their effective complexity? If so, how do these depend on the architecture, dataset, and hyperparameters? We identify two distinct types of {\textquotedblleft}frivolous{\textquotedblright} units that proliferate when the network{\textquoteright}s width increases: prunable units which can be dropped out of the network without significant change to the output and redundant units whose activities can be expressed as a linear combination of others. These units imply complexity constraints as the function the network computes could be expressed without them. We also identify how the development of these units can be influenced by architecture and a number of training factors. Together, these results help to explain why the accuracy of DNNs does not degrade when width is increased and highlight the importance of frivolous units toward understanding implicit regularization in DNNs.
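A schematic test for the second type of unit described above, asking whether the activity of one unit can be linearly predicted from the remaining units in its layer; the activations are random placeholders and the threshold is illustrative, not the criterion used in the paper.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
acts = rng.normal(size=(2000, 64))     # stimuli x units (placeholder activations)
unit = 0
others = np.delete(acts, unit, axis=1)

# R^2 of predicting the unit from its layer-mates; high values flag candidate redundant units
r2 = LinearRegression().fit(others, acts[:, unit]).score(others, acts[:, unit])
print("redundant candidate" if r2 > 0.95 else "not redundant")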

}, url = {https://dblp.org/rec/conf/aaai/CasperBDGSVK21.html}, author = {Stephen Casper and Xavier Boix and Vanessa D{\textquoteright}Amario and Ling Guo and Martin Schrimpf and Vinken, Kasper and Gabriel Kreiman} } @article {4734, title = {From Associative Memories to Powerful Machines}, year = {2021}, month = {01/2021}, abstract = {

Associative memories were implemented as simple networks of threshold neurons by Willshaw and Longuet-Higgins in the {\textquoteright}60s. Today{\textquoteright}s deep networks are quite similar: they can be regarded as approximating look-up tables, similar to Gaussian RBF networks. Thinking about deep networks as large associative memories provides a more realistic and sober perspective on the promises of deep learning.
Such associative networks are not powerful enough to account for intelligent abilities such as language or logic. Could evolution have discovered how to go beyond simple reflexes and associative memories? I will discuss how inventions such as recurrence and hidden states can transform look-up tables into powerful computing machines. In a July 2022 update I outline a theory framework explaining how deep networks may work, including transformers. The framework is based on proven results plus a couple of conjectures -- still open.

}, author = {Tomaso Poggio} } @article {4936, title = {From Marr{\textquoteright}s Vision to the Problem of Human Intelligence}, number = {118}, year = {2021}, month = {09/2021}, author = {Tomaso Poggio} } @conference {4821, title = {A Geometric Analysis of Deep Generative Image Models and Its Applications}, booktitle = {Proc. International Conference on Learning Representations, 2021}, year = {2021}, month = {01/2021}, abstract = {

Generative adversarial networks (GANs) have emerged as a powerful unsupervised method to model the statistical patterns of real-world data sets, such as natural images. These networks are trained to map random inputs in their latent space to new samples representative of the learned data. However, the structure of the latent space is hard to intuit due to its high dimensionality and the non-linearity of the generator, which limits the usefulness of the models. Understanding the latent space requires a way to identify input codes for existing real-world images (inversion), and a way to identify directions with known image transformations (interpretability). Here, we use a geometric framework to address both issues simultaneously. We develop an architecture-agnostic method to compute the Riemannian metric of the image manifold created by GANs. The eigen-decomposition of the metric isolates axes that account for different levels of image variability. An empirical analysis of several pretrained GANs shows that image variation around each position is concentrated along surprisingly few major axes (the space is highly anisotropic) and the directions that create this large variation are similar at different positions in the space (the space is homogeneous). We show that many of the top eigenvectors correspond to interpretable transforms in the image space, with a substantial part of eigenspace corresponding to minor transforms which could be compressed out. This geometric understanding unifies key previous results related to GAN interpretability. We show that the use of this metric allows for more efficient optimization in the latent space (e.g. GAN inversion) and facilitates unsupervised discovery of interpretable axes. Our results illustrate that defining the geometry of the GAN image manifold can serve as a general framework for understanding GANs.
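In the simplest case, where image space carries the Euclidean metric, the pulled-back Riemannian metric on the latent space can be written in terms of the generator Jacobian $J_G(z) = \partial G(z) / \partial z$ as

\[ H(z) = J_G(z)^{\top} J_G(z) , \]

whose top eigenvectors give the latent directions that produce the largest image changes around $z$. This is a sketch of the general construction; the estimator and image-space metric used in the paper may differ.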

}, author = {Binxu Wang and Carlos R Ponce} } @article {4831, title = {The human endogenous attentional control network includes a ventro-temporal cortical node}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {02/2021}, abstract = {

Endogenous attention is the cognitive function that selects the relevant pieces of sensory information to achieve goals and it is known to be controlled by dorsal fronto-parietal brain areas. Here we expand this notion by identifying a control attention area located in the temporal lobe. By combining a demanding behavioral paradigm with functional neuroimaging and diffusion tractography, we show that like fronto-parietal attentional areas, the human posterior inferotemporal cortex exhibits significant attentional modulatory activity. This area is functionally distinct from surrounding cortical areas, and is directly connected to parietal and frontal attentional regions. These results show that attentional control spans three cortical lobes and overarches large distances through fiber pathways that run orthogonally to the dominant anterior-posterior axes of sensory processing, thus suggesting a different organizing principle for cognitive control.

}, doi = {10.1038/s41467-020-20583-5}, url = {http://www.nature.com/articles/s41467-020-20583-5}, author = {Sani, Ilaria and Stemmann, Heiko and Caron, Bradley and Bullock, Daniel and Stemmler, Torsten and Fahle, Manfred and Pestilli, Franco and W. A. Freiwald} } @article {4824, title = {Human visual motion perception shows hallmarks of Bayesian structural inference}, journal = {Scientific Reports}, volume = {11}, year = {2021}, month = {02/2021}, abstract = {

Motion relations in visual scenes carry an abundance of behaviorally relevant information, but little is known about how humans identify the structure underlying a scene{\textquoteright}s motion in the first place. We studied the computations governing human motion structure identification in two psychophysics experiments and found that perception of motion relations showed hallmarks of Bayesian structural inference. At the heart of our research lies a tractable task design that enabled us to reveal the signatures of probabilistic reasoning about latent structure. We found that a choice model based on the task{\textquoteright}s Bayesian ideal observer accurately matched many facets of human structural inference, including task performance, perceptual error patterns, single-trial responses, participant-specific differences, and subjective decision confidence{\textemdash}especially, when motion scenes were ambiguous and when object motion was hierarchically nested within other moving reference frames. Our work can guide future neuroscience experiments to reveal the neural mechanisms underlying higher-level visual motion perception.

}, doi = {10.1038/s41598-021-82175-7}, url = {http://www.nature.com/articles/s41598-021-82175-7}, author = {Yang, Sichao and Bill, Johannes and Drugowitsch, Jan and Samuel J Gershman} } @article {5044, title = {Hypothesis-driven Online Video Stream Learning with Augmented Memory}, journal = {arXiv}, year = {2021}, month = {04/2021}, abstract = {

The ability to continuously acquire new knowledge without forgetting previous tasks remains a challenging problem for computer vision systems. Standard continual learning benchmarks focus on learning from static iid images in an offline setting. Here, we examine a more challenging and realistic online continual learning problem called online stream learning. Like humans, some AI agents have to learn incrementally from a continuous temporal stream of non-repeating data. We propose a novel model, Hypotheses-driven Augmented Memory Network (HAMN), which efficiently consolidates previous knowledge using an augmented memory matrix of "hypotheses" and replays reconstructed image features to avoid catastrophic forgetting. Compared with pixel-level and generative replay approaches, the advantages of HAMN are two-fold. First, hypothesis-based knowledge consolidation avoids redundant information in the image pixel space and makes memory usage far more efficient. Second, hypotheses in the augmented memory can be re-used for learning new tasks, improving generalization and transfer learning ability. Given a lack of online incremental class learning datasets on video streams, we introduce and adapt two additional video datasets, Toybox and iLab, for online stream learning. We also evaluate our method on the CORe50 and online CIFAR100 datasets. Our method performs significantly better than all state-of-the-art methods, while offering much more efficient memory usage. All source code and data are publicly available at this URL

}, doi = {10.48550/arXiv.2104.02206}, url = {https://arxiv.org/abs/2104.02206}, author = {Mengmi Zhang and Rohil Badkundri and Morgan B. Talbot and Rushikesh Zawar and Gabriel Kreiman} } @article {5009, title = {Image interpretation by iterative bottom-up top- down processing}, number = {120}, year = {2021}, month = {11/2021}, abstract = {

Scene understanding requires the extraction and representation of scene components, such as objects and their parts, people, and places, together with their individual properties, as well as relations and interactions between them. We describe a model in which meaningful scene structures are extracted from the image by an iterative process, combining bottom-up (BU) and top-down (TD) networks, interacting through a symmetric bi-directional communication between them ({\textquoteleft}counter-streams{\textquoteright} structure). The BU-TD model extracts and recognizes scene constituents with their selected properties and relations, and uses them to describe and understand the image.

The scene representation is constructed by the iterative use of three components. The first model component is a bottom-up stream that extracts selected scene elements, properties and relations. The second component ({\textquoteleft}cognitive augmentation{\textquoteright}) augments the extracted visual representation based on relevant non-visual stored representations. It also provides input to the third component, the top-down stream, in the form of a TD instruction, instructing the model what task to perform next. The top-down stream then guides the BU visual stream to perform the selected task in the next cycle. During this process, the visual representations extracted from the image can be combined with relevant non-visual representations, so that the final scene representation is based on both visual information extracted from the scene and relevant stored knowledge of the world.

We show how the BU-TD model composes complex visual tasks from sequences of steps, invoked by individual TD instructions. In particular, we describe how a sequence of TD instructions is used to extract from the scene structures of interest, including an algorithm to automatically select the next TD instruction in the sequence. The selection of the TD instruction depends in general on the goal, the image, and on information already extracted from the image in previous steps. The TD-instruction sequence is therefore not a fixed sequence determined at the start, but an evolving program (or {\textquoteleft}visual routine{\textquoteright}) that depends on the goal and the image.

The extraction process is shown to have favourable properties in terms of combinatorial generalization, generalizing well to novel scene structures and new combinations of objects, properties and relations not seen during training. Finally, we compare the model with relevant aspects of human vision, and suggest directions for using the BU-TD scheme for integrating visual and cognitive components in the process of scene understanding.

}, author = {Shimon Ullman and Liav Assif and Alona Strugatski and Ben-Zion Vatashsky and Hila Levi and Aviv Netanyahu and Adam Uri Yaari} } @article {5075, title = {Joint encoding of facial identity, orientation, gaze, and expression in the middle dorsal face area}, journal = {Proceedings of the National Academy of Sciences}, volume = {118}, year = {2021}, month = {04/2021}, abstract = {

The last two decades have established that a network of face-selective areas in the temporal lobe of macaque monkeys supports the visual processing of faces. Each area within the network contains a large fraction of face-selective cells. And each area encodes facial identity and head orientation differently. A recent brain-imaging study discovered an area outside of this network selective for naturalistic facial motion, the middle dorsal (MD) face area. This finding offers the opportunity to determine whether coding principles revealed inside the core network would generalize to face areas outside the core network. We investigated the encoding of static faces and objects, facial identity, and head orientation, dimensions which had been studied in multiple areas of the core face-processing network before, as well as facial expressions and gaze. We found that MD populations form a face-selective cluster with a degree of selectivity comparable to that of areas in the core face-processing network. MD encodes facial identity robustly across changes in head orientation and expression, it encodes head orientation robustly against changes in identity and expression, and it encodes expression robustly across changes in identity and head orientation. These three dimensions are encoded in a separable manner. Furthermore, MD also encodes the direction of gaze in addition to head orientation. Thus, MD encodes both structural properties (identity) and changeable ones (expression and gaze) and thus provides information about another animal{\textquoteright}s direction of attention (head orientation and gaze). MD contains a heterogeneous population of cells that establish a multidimensional code for faces.

}, issn = {0027-8424}, doi = {10.1073/pnas.2108283118}, url = {https://pnas.org/doi/full/10.1073/pnas.2108283118}, author = {Yang, Zetian and W. A. Freiwald} } @article {4825, title = {Large-scale benchmarking of deep neural network models in mouse visual cortex reveals patterns similar to those observed in macaque visual cortex}, year = {2021}, abstract = {

What is the representational structure of mouse visual cortex and how is it shaped? Mice obviously interact with the world and recognize objects but unlike in primates, a majority of research to date suggests the activity of their visual cortex may not be so well described by deep neural networks trained for object recognition. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 30,000 rodent visual cortical neurons recorded in response to natural scenes, we work to resolve this discrepancy and demonstrate that modern neural networks can indeed be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. In so doing, we elucidate at large scale the properties of networks which best match the biological visual system, with both representational similarity analysis and encoding models coming to mostly the same conclusions. Our analysis of 30 object recognition architectures (both pretrained and randomly initialized) from the PyTorch model zoo demonstrates that deeper, thinner residual networks with bypass connections, fewer parameters shared across many convolutions, and higher scores on the ImageNet image-recognition challenge tend to be more predictive of the neural activations in our sample. Additionally, we find a significant degree of overlap between the models that best predict macaque visual cortex (as catalogued by brain-score.org) and those that best predict mouse visual cortex. In concert, these findings help to bolster the mouse brain as a viable source of data for the methods that have been successful thus far in the study of monkey brains, and provide a preliminary set of design targets for building models that can better take advantage of the unparalleled scale, quality, and resolution of data afforded by calcium-imaging in the mouse brain.
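A minimal sketch of the representational-similarity comparison mentioned above; the data are random placeholders, and the actual pipeline, reliability corrections, and metrics in the study may differ.

import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
model_feats = rng.normal(size=(118, 512))   # stimuli x model features (placeholder)
neural_resp = rng.normal(size=(118, 300))   # stimuli x neurons (placeholder)

# representational dissimilarity matrices (condensed form) and their rank correlation
rdm_model = pdist(model_feats, metric="correlation")
rdm_neural = pdist(neural_resp, metric="correlation")
rho, _ = spearmanr(rdm_model, rdm_neural)
print(rho)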

}, author = {Colin Conwell and David Mayo and Michael Buice and Boris Katz and George Alvarez and Andrei Barbu} } @proceedings {4927, title = {Let{\textquoteright}s talk (efficiently) about us: Person systems achieve near-optimal compression}, volume = {43}, year = {2021}, month = {07/2021}, abstract = {

Systems of personal pronouns (e.g.,{\textquoteleft}you{\textquoteright} and {\textquoteleft}I{\textquoteright}) vary widely across languages, but at the same time not all possible systems are attested. Linguistic theories have generally accounted for this in terms of strong grammatical constraints, but recent experimental work challenges this view. Here, we take a novel approach to understanding personal pronoun systems by invoking a recent information-theoretic framework for semantic systems that predicts that languages efficiently compress meanings into forms. We find that a test set of cross-linguistically attested personal pronoun systems achieves near-optimal compression, supporting the hypothesis that efficient compression shapes semantic systems. Further, our best-fitting model includes an egocentric bias that favors a salient speaker representation, accounting for a well-known typological generalization of person systems ({\textquoteleft}Zwicky{\textquoteright}s Generalization{\textquoteright}) without the need for a hard grammatical constraint.
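The efficiency framework invoked here trades off the complexity of a lexicon against the accuracy of the meanings it conveys. Schematically, following the general information-bottleneck formulation this line of work builds on (with $M$ denoting speaker meanings, $W$ words, and $U$ listener reconstructions, notation assumed for this sketch rather than taken from the paper), languages are scored by an objective of the form

\[ \mathcal{F}_{\beta} = I(M;W) - \beta\, I(W;U) , \]

and near-optimal systems lie close to the frontier of minimal $\mathcal{F}_{\beta}$ across tradeoff parameters $\beta$.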

}, url = {https://escholarship.org/uc/item/2sj4t8m3}, author = {Zaslavsky, Noga and Maldonado, Mora and Culbertson, Jennifer} } @article {4497, title = {Leveraging facial expressions and contextual information to investigate opaque representations of emotions.}, journal = {Emotion}, year = {2021}, month = {02/2021}, abstract = {

Observers attribute emotions to others relying on multiple cues, including facial expressions and information about the situation. Recent research has used Bayesian models to study how these cues are integrated. Existing studies have used a variety of tasks to probe emotion inferences, but limited attention has been devoted to the possibility that different decision processes might be involved depending on the task. If this is the case, understanding emotion representations might require understanding the decision processes through which they give rise to judgments. This article 1) shows that the different tasks that have been used in the literature yield very different results, 2) proposes an account of the decision processes involved that explain the differences, and 3) tests novel predictions of this account. The results offer new insights into how emotions are represented, and more broadly demonstrate the importance of taking decision processes into account in Bayesian models of cognition.

}, issn = {1528-3542}, doi = {10.1037/emo0000685}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/emo0000685}, author = {Stefano Anzellotti and Sean Dae Houlihan and Samuel Liburd Jr and Rebecca Saxe} } @article {5043, title = {Localized task-invariant emotional valence encoding revealed by intracranial recordings}, journal = {Social Cognitive and Affective Neuroscience}, year = {2021}, month = {12/2022}, abstract = {

The ability to distinguish between negative, positive and neutral valence is a key part of emotion perception. Emotional valence has conceptual meaning that supersedes any particular type of stimulus, although it is typically captured experimentally in association with particular tasks. We sought to identify neural encoding for task-invariant emotional valence. We evaluated whether high gamma responses (HGRs) to visually displayed words conveying emotions could be used to decode emotional valence from HGRs to facial expressions. Intracranial electroencephalography (iEEG) was recorded from fourteen individuals while they participated in two tasks, one involving reading words with positive, negative, and neutral valence, and the other involving viewing faces with positive, negative, and neutral facial expressions. Quadratic discriminant analysis was used to identify information in the HGR that differentiates the three emotion conditions. A classifier was trained on the emotional valence labels from one task and was cross-validated on data from the same task (within-task classifier) as well as the other task (between-task classifier). Emotional valence could be decoded in the left medial orbitofrontal cortex and middle temporal gyrus, both using within-task classifiers as well as between-task classifiers. These observations suggest the presence of task-independent emotional valence information in the signals from these regions.

}, keywords = {classifier, decoding, emotion, intracranial EEG, valence}, issn = {1749-5016}, doi = {10.1093/scan/nsab134}, url = {https://academic.oup.com/scan/advance-article/doi/10.1093/scan/nsab134/6481890}, author = {Weisholtz, Daniel S and Gabriel Kreiman and Silbersweig, David A and Stern, Emily and Cha, Brannon and Butler, Tracy} } @article {4826, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.
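For reference, the ungrounded WEAT statistic that these grounded metrics build on compares two sets of target embeddings $X, Y$ against two attribute sets $A, B$ through the per-word association

\[ s(w, A, B) = \mathrm{mean}_{a \in A}\, \cos(w, a) - \mathrm{mean}_{b \in B}\, \cos(w, b) , \]

with effect size $\left( \mathrm{mean}_{x \in X}\, s(x, A, B) - \mathrm{mean}_{y \in Y}\, s(y, A, B) \right) / \mathrm{std}_{w \in X \cup Y}\, s(w, A, B)$. The grounded generalizations introduced in the paper replace or augment these embeddings with visually grounded ones; the formula above is the standard starting point, not the full set of grounded variants.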

}, author = {Candace Ross and Boris Katz and Andrei Barbu} } @article {5057, title = {Measuring Social Biases in Grounded Vision and Language Embeddings}, year = {2021}, abstract = {

We generalize the notion of measuring social biases in word embeddings to visually grounded word embeddings. Biases are present in grounded embeddings, and indeed seem to be equally or more significant than for ungrounded embeddings. This is despite the fact that vision and language can suffer from different biases, which one might hope could attenuate the biases in both. Multiple ways exist to generalize metrics measuring bias in word embeddings to this new setting. We introduce the space of generalizations (Grounded-WEAT and Grounded-SEAT) and demonstrate that three generalizations answer different yet important questions about how biases, language, and vision interact. These metrics are used on a new dataset, the first for grounded bias, created by augmenting standard linguistic bias benchmarks with 10,228 images from COCO, Conceptual Captions, and Google Images. Dataset construction is challenging because vision datasets are themselves very biased. The presence of these biases in systems will begin to have real-world consequences as they are deployed, making carefully measuring bias and then mitigating it critical to building a fair society.

}, author = {Candace Ross and Andrei Barbu and Boris Katz} } @article {4786, title = {Memory as a Computational Resource}, journal = {Trends in Cognitive Sciences}, volume = {25}, year = {2021}, month = {03/2021}, pages = {240 - 251}, abstract = {

Most computations that people do in everyday life are very expensive. Recent research highlights that humans make efficient use of their limited computational resources to tackle these problems. Memory is a crucial aspect of algorithmic efficiency and permits the reuse of past computation through memoization. We review neural and behavioral evidence of humans reusing past computations across several domains, including mental imagery, arithmetic, planning, and probabilistic inference. Recent developments in neural networks expand the scope of computational reuse with a distributed form of memoization called amortization. This opens many new avenues of research. Computer scientists have long recognized that naive implementations of algorithms often result in a paralyzing degree of redundant computation. More sophisticated implementations harness the power of memory by storing computational results and reusing them later. We review the application of these ideas to cognitive science, in four case studies (mental arithmetic, mental imagery, planning, and probabilistic inference). Despite their superficial differences, these cognitive processes share a common reliance on memory that enables efficient computation.

}, keywords = {amortization, inference, memory, mental arithmetic, mental imagery, planning}, issn = {13646613}, doi = {10.1016/j.tics.2020.12.008}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661320303053}, author = {Ishita Dasgupta and Samuel J Gershman} } @article {5040, title = {Mesoscopic physiological interactions in the human brain reveal small-world properties}, journal = {Cell Reports}, volume = {36}, year = {2021}, month = {08/2021}, pages = {109585}, abstract = {

Cognition depends on rapid and robust communication between neural circuits spanning different brain areas. We investigated the mesoscopic network of cortico-cortical interactions in the human brain in an extensive dataset consisting of 6,024\ h of intracranial field potential recordings from 4,142 electrodes in 48 subjects. We evaluated communication between brain areas at the network level across different frequency bands. The interaction networks were validated against known anatomical measurements and neurophysiological interactions in humans and monkeys. The resulting human brain interactome is characterized by a broad and spatially specific, dynamic, and extensive network. The physiological interactome reveals small-world properties, which we conjecture might facilitate efficient and reliable information transmission. The interaction dynamics correlate with the brain sleep/awake state. These results constitute initial steps toward understanding how the interactome orchestrates cortical communication and provide a reference for future efforts assessing how dysfunctional interactions may lead to mental disorders.
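For readers unfamiliar with the small-world property, a common way to quantify it is the coefficient sigma = (C / C_rand) / (L / L_rand), comparing clustering C and average path length L against degree-matched random graphs. The NetworkX sketch below is a generic illustration under that assumption, not the interactome analysis pipeline of the paper.

```python
import networkx as nx

def small_world_sigma(G, n_random=20, seed=0):
    # sigma > 1 suggests small-world structure: high clustering with short paths
    C, L = nx.average_clustering(G), nx.average_shortest_path_length(G)
    c_rand, l_rand = [], []
    for i in range(n_random):
        R = nx.gnm_random_graph(G.number_of_nodes(), G.number_of_edges(), seed=seed + i)
        if nx.is_connected(R):
            c_rand.append(nx.average_clustering(R))
            l_rand.append(nx.average_shortest_path_length(R))
    return (C / (sum(c_rand) / len(c_rand))) / (L / (sum(l_rand) / len(l_rand)))

print(small_world_sigma(nx.connected_watts_strogatz_graph(100, 6, 0.1, seed=1)))
```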

}, issn = {22111247}, doi = {10.1016/j.celrep.2021.109585}, url = {https://linkinghub.elsevier.com/retrieve/pii/S2211124721010196}, author = {Wang, Jiarui and Tao, Annabelle and Anderson, William S. and Madsen, Joseph R. and Gabriel Kreiman} } @article {5077, title = {Meta-strategy learning in physical problem solving: the effect of embodied experience}, journal = {bioRxiv}, year = {2021}, month = {08/2021}, abstract = {

Embodied cognition suggests that our experience in our bodies -- including our motor experiences -- shapes our cognitive and perceptual capabilities broadly. Much work has studied how differences in the physical body (either natural or manipulated) can impact people{\textquoteright}s cognitive and perceptual capacities, but often these judgments relate directly to those body differences. Here we focus instead on how natural embodied experience affects what kinds of abstract physical problem-solving strategies people use in a virtual task. We compare how groups with different embodied experience -- children and adults with congenital limb differences versus those born with two hands -- perform on this task, and find that while there is no difference in overall accuracy or time to complete the task, the groups use different meta-strategies to come to solutions. Specifically, both children and adults born with limb differences take a longer time to think before acting, and as a result take fewer overall actions to reach solutions to physical reasoning problems. Conversely, the process of development affects the particular actions children use as they age regardless of how many hands they were born with, as well as their persistence with their current strategy. Taken together, our findings suggest that differences in embodied experience drive the acquisition of different meta-strategies for balancing acting with thinking, deciding what kinds of actions to try, and deciding how persistent to be with a current action plan.

}, author = {Kelsey Allen and Kevin A Smith and Laura Bird and Joshua B. Tenenbaum and Tamar Makin and Dorothy Cowie} } @article {5078, title = {Moral dynamics: Grounding moral judgment in intuitive physics and intuitive psychology}, journal = {Cognition}, volume = {217}, year = {2021}, month = {05/2021}, pages = {104890}, abstract = {

When holding others morally responsible, we care about what they did, and what they thought. Traditionally, research in moral psychology has relied on vignette studies, in which a protagonist{\textquoteright}s actions and thoughts are explicitly communicated. While this research has revealed what variables are important for moral judgment, such as actions and intentions, it is limited in providing a more detailed understanding of exactly how these variables affect moral judgment. Using dynamic visual stimuli that allow for a more fine-grained experimental control, recent studies have proposed a direct mapping from visual features to moral judgments. We embrace the use of visual stimuli in moral psychology, but question the plausibility of a feature-based theory of moral judgment. We propose that the connection from visual features to moral judgments is mediated by an inference about what the observed action reveals about the agent{\textquoteright}s mental states, and what causal role the agent{\textquoteright}s action played in bringing about the outcome. We present a computational model that formalizes moral judgments of agents in visual scenes as computations over an intuitive theory of physics combined with an intuitive theory of mind. We test the model{\textquoteright}s quantitative predictions in three experiments across a wide variety of dynamic interactions between agent and patient.

}, issn = {00100277}, doi = {10.1016/j.cognition.2021.104890}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027721003139}, author = {Sosa, Felix A. and Ullman, Tomer and Joshua B. Tenenbaum and Samuel J Gershman and Gerstenberg, Tobias} } @conference {4827, title = {Multi-resolution modeling of a discrete stochastic process identifies causes of cancer}, booktitle = {International Conference on Learning Representations}, year = {2021}, month = {09/2020}, abstract = {

Detection of cancer-causing mutations within the vast and mostly unexplored human genome is a major challenge. Doing so requires modeling the background mutation rate, a highly non-stationary stochastic process, across regions of interest varying in size from one to millions of positions. Here, we present the split-Poisson-Gamma (SPG) distribution, an extension of the classical Poisson-Gamma formulation, to model a discrete stochastic process at multiple resolutions. We demonstrate that the probability model has a closed-form posterior, enabling efficient and accurate linear-time prediction over any length scale after the parameters of the model have been inferred a single time. We apply our framework to model mutation rates in tumors and show that model parameters can be accurately inferred from high-dimensional epigenetic data using a convolutional neural network, Gaussian process, and maximum-likelihood estimation. Our method is both more accurate and more efficient than existing models over a large range of length scales. We demonstrate the usefulness of multi-resolution modeling by detecting genomic elements that drive tumor emergence and are of vastly differing sizes.
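The SPG model itself is not spelled out in the abstract, but the classical Poisson-Gamma building block it extends does have a simple closed-form posterior, sketched below under standard conjugacy assumptions (illustrative only, not the paper's multi-resolution machinery).

```python
from scipy import stats

def poisson_gamma_posterior(counts, alpha0=1.0, beta0=1.0):
    # Gamma(alpha0, rate=beta0) prior on a Poisson rate; observing counts x_1..x_n
    # gives the closed-form posterior Gamma(alpha0 + sum(x), rate=beta0 + n).
    alpha = alpha0 + sum(counts)
    beta = beta0 + len(counts)
    return stats.gamma(a=alpha, scale=1.0 / beta)  # scipy uses scale = 1 / rate

posterior = poisson_gamma_posterior([3, 1, 4, 0, 2])
print(posterior.mean(), posterior.interval(0.95))
```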

}, url = {https://openreview.net/forum?id=KtH8W3S_RE}, author = {Adam Uri Yaari and Maxwell Sherman and Oliver Clarke Priebe and Po-Ru Loh and Boris Katz and Andrei Barbu and Bonnie Berger} } @article {5079, title = {Multi-scale hierarchical neural network models that bridge from single neurons in the primate primary visual cortex to object recognition behavior}, journal = {bioRxiv}, year = {2021}, month = {08/2021}, abstract = {

Primate visual object recognition relies on the representations in cortical areas at the top of the ventral stream that are computed by a complex, hierarchical network of neural populations. While recent work has created reasonably accurate image-computable hierarchical neural network models of those neural stages, those models do not yet bridge between the properties of individual neurons and the overall emergent behavior of the ventral stream. One reason we cannot yet do this is that individual artificial neurons in multi-stage models have not been shown to be functionally similar to individual biological neurons. Here, we took an important first step by building and evaluating hundreds of hierarchical neural network models in how well their artificial single neurons approximate macaque primary visual cortical (V1) neurons. We found that single neurons in certain models are surprisingly similar to their biological counterparts and that the distributions of single neuron properties, such as those related to orientation and spatial frequency tuning, approximately match those in macaque V1. Critically, we observed that hierarchical models with V1 stages that better match macaque V1 at the single neuron level are also more aligned with human object recognition behavior. Finally, we show that an optimized classical neuroscientific model of V1 is more functionally similar to primate V1 than all of the tested multi-stage models, suggesting room for further model improvements with tangible payoffs in closer alignment to human behavior. These results provide the first multi-stage, multi-scale models that allow our field to ask precisely how the specific properties of individual V1 neurons relate to recognition behavior.

}, author = {Tiago Marques and Martin Schrimpf and James J. DiCarlo} } @article {4787, title = {Multi-task reinforcement learning in humans}, journal = {Nature Human Behaviour}, year = {2021}, month = {01/2021}, abstract = {

The ability to transfer knowledge across tasks and generalize to novel ones is an important hallmark of human intelligence. Yet not much is known about human multitask reinforcement learning. We study participants{\textquoteright} behaviour in a two-step decision-making task with multiple features and changing reward functions. We compare their behaviour with two algorithms for multitask reinforcement learning, one that maps previous policies and encountered features to new reward functions and one that approximates value functions across tasks, as well as to standard model-based and model-free algorithms. Across three exploratory experiments and a large preregistered confirmatory experiment, our results provide evidence that participants who are able to learn the task use a strategy that maps previously learned policies to novel scenarios. These results enrich our understanding of human reinforcement learning in complex environments with changing task demands.

}, doi = {10.1038/s41562-020-01035-y}, url = {http://www.nature.com/articles/s41562-020-01035-y}, author = {Tomov, Momchil S. and Eric Schulz and Samuel J Gershman} } @article {4969, title = {The neural architecture of language: Integrative modeling converges on predictive processing}, journal = {Proceedings of the National Academy of Sciences}, volume = {118}, year = {2021}, month = {11/2021}, pages = {e2105646118}, abstract = {

Significance

Language is a quintessentially human ability. Research has long probed the functional architecture of language in the mind and brain using diverse neuroimaging, behavioral, and computational modeling approaches. However, adequate neurally-mechanistic accounts of how meaning might be extracted from language are sorely lacking. Here, we report a first step toward addressing this gap by connecting recent artificial neural networks from machine learning to human recordings during language processing. We find that the most powerful models predict neural and behavioral responses across different datasets up to noise levels. Models that perform better at predicting the next word in a sequence also better predict brain measurements{\textemdash}providing computationally explicit evidence that predictive processing fundamentally shapes the language comprehension mechanisms in the brain.

Abstract

The neuroscience of perception has recently been revolutionized with an integrative modeling approach in which computation, brain function, and behavior are linked across many datasets and many computational models. By revealing trends across models, this approach yields novel insights into cognitive and neural mechanisms in the target domain. We here present a systematic study taking this approach to higher-level cognition: human language processing, our species{\textquoteright} signature cognitive skill. We find that the most powerful {\textquotedblleft}transformer{\textquotedblright} models predict nearly 100\% of explainable variance in neural responses to sentences and generalize across different datasets and imaging modalities (functional MRI and electrocorticography). Models{\textquoteright} neural fits ({\textquotedblleft}brain score{\textquotedblright}) and fits to behavioral responses are both strongly correlated with model accuracy on the next-word prediction task (but not other language tasks). Model architecture appears to substantially contribute to neural fit. These results provide computationally explicit evidence that predictive processing fundamentally shapes the language comprehension mechanisms in the human brain.
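A hedged sketch of the general idea behind a "brain score": regress model activations onto neural responses with cross-validation and report the held-out correlation. This is a generic illustration, not the specific pipeline, regularization, or datasets used in the paper.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

def brain_score(model_acts, neural_resps, alpha=1.0, n_splits=5):
    # model_acts: (n_stimuli, n_features); neural_resps: (n_stimuli, n_voxels)
    scores = []
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    for train, test in kf.split(model_acts):
        reg = Ridge(alpha=alpha).fit(model_acts[train], neural_resps[train])
        pred = reg.predict(model_acts[test])
        # correlation between predicted and observed response, per voxel
        r = [np.corrcoef(pred[:, v], neural_resps[test, v])[0, 1]
             for v in range(neural_resps.shape[1])]
        scores.append(np.nanmean(r))
    return float(np.mean(scores))
```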

}, issn = {0027-8424}, doi = {10.1073/pnas.2105646118}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2105646118}, author = {Martin Schrimpf and Blank, Idan Asher and Tuckute, Greta and Kauf, Carina and Hosseini, Eghbal A. and Nancy Kanwisher and Joshua B. Tenenbaum and Fedorenko, Evelina} } @book {5076, title = {The Neural Basis of Mentalizing: Linking Models of Theory of Mind and Measures of Human Brain Activity}, year = {2021}, month = {05/2021}, pages = {209 - 235}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, isbn = {978-3-030-51889-9}, doi = {10.1007/978-3-030-51890-5_11}, url = {https://link.springer.com/10.1007/978-3-030-51890-5}, author = {Sean Dae Houlihan and Joshua B. Tenenbaum and Rebecca Saxe}, editor = {Gilead, Michael and Ochsner, Kevin N.} } @article {5052, title = {Neural Regression, Representational Similarity, Model Zoology \& Neural Taskonomy at Scale in Rodent Visual Cortex}, year = {2021}, abstract = {

How well do deep neural networks fare as models of mouse visual cortex? A majority of research to date suggests results far more mixed than those produced in the modeling of primate visual cortex. Here, we perform a large-scale benchmarking of dozens of deep neural network models in mouse visual cortex with both representational similarity analysis and neural regression. Using the Allen Brain Observatory{\textquoteright}s 2-photon calcium-imaging dataset of activity in over 6,000 reliable rodent visual cortical neurons recorded in response to natural scenes, we replicate previous findings and resolve previous discrepancies, ultimately demonstrating that modern neural networks can in fact be used to explain activity in the mouse visual cortex to a more reasonable degree than previously suggested. Using our benchmark as an atlas, we offer preliminary answers to overarching questions about levels of analysis, questions about the properties of models that best predict the visual system overall and questions about the mapping between biological and artificial representations. Our results provide a reference point for future ventures in the deep neural network modeling of mouse visual cortex, hinting at novel combinations of mapping method, architecture, and task to more fully characterize the computational motifs of visual representation in a species so central to neuroscience, but with a perceptual physiology and ecology markedly different from the ones we study in primates.
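For orientation, one of the two mapping methods named here can be illustrated generically: the representational similarity analysis (RSA) sketch below builds representational dissimilarity matrices for model and brain and correlates their condensed forms. The distance metric and correlation type are assumptions, not necessarily the paper's exact choices.

```python
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

def rsa_similarity(model_acts, neural_resps):
    # Each input is (n_stimuli, n_units); pdist with 'correlation' yields the
    # condensed upper triangle of a 1 - Pearson-r dissimilarity matrix (an RDM).
    model_rdm = pdist(model_acts, metric="correlation")
    brain_rdm = pdist(neural_resps, metric="correlation")
    # Spearman correlation between the two RDMs serves as the RSA score
    return spearmanr(model_rdm, brain_rdm).correlation
```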

}, author = {Colin Conwell and David Mayo and Michael A. Buice and Boris Katz and George A. Alvarez and Andrei Barbu} } @article {4828, title = {Neuroscience: A Face{\textquoteright}s Journey through Space and Time}, journal = {Current Biology}, volume = {31}, year = {2021}, month = {01/2021}, pages = {R13 - R15}, abstract = {

Faces are complex objects of great variety, which the visual brain somehow manages to organize by similarity. Two such orderings in fact exist and one, a new study finds, is transformed into the other over time, enhancing a face{\textquoteright}s distinctiveness.

}, issn = {09609822}, doi = {10.1016/j.cub.2020.10.065}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0960982220316080}, author = {W. A. Freiwald and Hosoya, Haruo} } @article {4829, title = {No evidence for prolactin{\textquoteright}s involvement in the post-ejaculatory refractory period}, journal = {Communications Biology}, volume = {4}, year = {2021}, month = {01/2021}, doi = {10.1038/s42003-020-01570-4}, url = {http://www.nature.com/articles/s42003-020-01570-4}, author = {Valente, Susana and Marques, Tiago and Lima, Susana Q.} } @article {5080, title = {Partial Mental Simulation Explains Fallacies in Physical Reasoning}, journal = {psyArXiv}, year = {2021}, month = {11/2021}, abstract = {

People can reason intuitively, efficiently, and accurately about everyday physical events. Recent accounts suggest that people use mental simulation to make such intuitive physical judgments. But mental simulation models are computationally expensive; how is physical reasoning relatively accurate, while maintaining computational tractability? We suggest that people make use of partial simulation, mentally moving forward in time only parts of the world deemed relevant. We propose a novel partial simulation model, and test it on the physical conjunction fallacy, a recently observed phenomenon (Ludwin-Peery, Bramley, Davis, \& Gureckis, 2020) that poses a challenge for full simulation models. We find an excellent fit between our model{\textquoteright}s predictions and human performance on a set of scenarios that build on and extend those used by Ludwin-Peery et al. (2020), quantitatively and qualitatively accounting for a deviation from optimal performance. Our results suggest more generally how we allocate cognitive resources to efficiently represent and simulate physical scenes.

}, url = {https://psyarxiv.com/y4a8x}, author = {Illona Bass and Kevin A Smith and Elizabeth Bonawitz and Tomer Ullman} } @conference {4830, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {AAAI-21}, year = {2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5058, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, number = {123}, year = {2021}, month = {03/2021}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feedforward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {5062, title = {Plans or Outcomes: How Do We Attribute Intelligence to Others?}, journal = {Cognitive Science}, volume = {45}, year = {2021}, month = {09/2021}, issn = {0364-0213}, doi = {10.1111/cogs.13041}, url = {https://onlinelibrary.wiley.com/toc/15516709/45/9}, author = {Marta Kryven and Ullman, Tomer D. and Cowan, William and Joshua B. Tenenbaum} } @article {4980, title = {Selective responses to faces, scenes, and bodies in the ventral visual pathway of infants}, journal = {Current Biology}, volume = {32}, year = {2021}, month = {11/2021}, chapter = {1-20}, abstract = {

Three of the most robust functional landmarks in the human brain are the selective responses to faces in the fusiform face area (FFA), scenes in the parahippocampal place area (PPA), and bodies in the extrastriate body area (EBA). Are the selective responses of these regions present early in development or do they require many years to develop? Prior evidence leaves this question unresolved. We designed a new 32-channel infant magnetic resonance imaging (MRI) coil and collected high-quality functional MRI (fMRI) data from infants (2{\textendash}9\ months of age) while they viewed stimuli from four conditions{\textemdash}faces, bodies, objects, and scenes. We find that infants have face-, scene-, and body-selective responses in the location of the adult FFA, PPA, and EBA, respectively, powerfully constraining accounts of cortical development.

}, doi = {10.1016/j.cub.2021.10.064}, url = {https://www.sciencedirect.com/science/article/pii/S0960982221015086}, author = {Heather L Kosakowski and Cohen, Michael A. and Takahashi, Atsushi and Keil, Boris and Nancy Kanwisher and Rebecca Saxe} } @article {5053, title = {Social Interactions as Recursive MDPs}, year = {2021}, abstract = {

While machines and robots must interact with humans, providing them with social skills has been a largely overlooked topic. This is mostly a consequence of the fact that tasks such as navigation, command following, and even game playing are well-defined, while social reasoning still mostly remains a pre-theoretic problem. We demonstrate how social interactions can be effectively incorporated into MDPs (Markov decision processes) by reasoning recursively about the goals of other agents. In essence, our method extends the reward function to include a combination of physical goals (something agents want to accomplish in the configuration space, a traditional MDP) and social goals (something agents want to accomplish relative to the goals of other agents). Our Social MDPs allow specifying reward functions in terms of the estimated reward functions of other agents, modeling interactions such as helping or hindering another agent (by maximizing or minimizing the other agent{\textquoteright}s reward) while balancing this with the actual physical goals of each agent. Our formulation allows for an arbitrary function of another agent{\textquoteright}s estimated reward structure and physical goals, enabling more complex behaviors such as politely hindering another agent or aggressively helping them. Extending Social MDPs in the same manner that I-POMDPs (interactive partially observable Markov decision processes) extend POMDPs would enable interactions such as convincing another agent that something is true. To what extent the Social MDPs presented here, and their potential Social POMDP variant, account for all possible social interactions is unknown. Nevertheless, a precise mathematical model to guide questions about social interactions has both practical value (we demonstrate how to make zero-shot social inferences, and one can imagine chatbots and robots guided by Social MDPs) and theoretical value, bringing the tools of MDPs that have so successfully organized research around navigation to bear on what social interactions really are, given their extreme importance to human well-being and human civilization.
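To make the reward construction concrete, a toy sketch of the kind of composite reward described here is given below: a physical reward plus a weighted term in the other agent's estimated reward, with the sign of the weight switching between helping and hindering. The specific functional form and names are illustrative assumptions, not the paper's exact formulation.

```python
def social_reward(state, action, phys_reward, est_other_reward, social_weight):
    """Composite reward for agent i in a toy Social-MDP-style setup.

    phys_reward(state, action)      -> agent i's own physical goal reward
    est_other_reward(state, action) -> agent i's estimate of agent j's reward
    social_weight > 0 models helping j; social_weight < 0 models hindering j.
    """
    return phys_reward(state, action) + social_weight * est_other_reward(state, action)

# Example: an agent that mildly helps another while pursuing its own goal
r = social_reward(state=None, action=None,
                  phys_reward=lambda s, a: 1.0,
                  est_other_reward=lambda s, a: 0.5,
                  social_weight=0.3)
```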

}, author = {Ravi Tejwani and Yen-Ling Kuo and Tianmin Shu and Boris Katz and Andrei Barbu} } @conference {5081, title = {Spoken ObjectNet: A Bias-Controlled Spoken Caption Dataset}, booktitle = {Interspeech 2021}, year = {2021}, month = {08/2021}, address = {ISCA}, doi = {10.21437/Interspeech.2021}, url = {https://www.isca-speech.org/archive/interspeech_2021}, author = {Palmer, Ian and Rouditchenko, Andrew and Andrei Barbu and Katz, Boris and Glass, James} } @article {5055, title = {Spoken ObjectNet: A Bias-Controlled Spoken Caption Dataset}, year = {2021}, abstract = {

Visually-grounded spoken language datasets can enable models to learn cross-modal correspondences with very weak supervision. However, modern audio-visual datasets contain biases that undermine the real-world performance of models trained on that data. We introduce Spoken ObjectNet, which is designed to remove some of these biases and provide a way to better evaluate how effectively models will perform in real-world scenarios. This dataset expands upon ObjectNet, which is a bias-controlled image dataset that features similar image classes to those present in ImageNet. We detail our data collection pipeline, which features several methods to improve caption quality, including automated language model checks. Lastly, we show baseline results on image retrieval and audio retrieval tasks. These results show that models trained on other datasets and then evaluated on Spoken ObjectNet tend to perform poorly due to biases in other datasets that the models have learned. We also show evidence that the performance decrease is due to the dataset controls, and not the transfer setting.

}, author = {Ian Palmer and Andrew Rouditchenko and Andrei Barbu and Boris Katz and James Glass} } @conference {5063, title = {Temporal and Object Quantification Networks}, booktitle = {Proceedings of the Thirtieth International Joint Conference on Artificial Intelligence (IJCAI-21)}, year = {2021}, month = {06/2021}, address = {Montreal, Canada}, abstract = {

We present Temporal and Object Quantification Networks (TOQ-Nets), a new class of neuro-symbolic networks with a structural bias that enables them to learn to recognize complex relational-temporal events. This is done by including reasoning layers that implement finite-domain quantification over objects and time. The structure allows them to generalize directly to input instances with varying numbers of objects in temporal sequences of varying lengths. We evaluate TOQ-Nets on input domains that require recognizing event-types in terms of complex temporal relational patterns. We demonstrate that TOQ-Nets can generalize from small amounts of data to scenarios containing more objects than were present during training and to temporal warpings of input sequences.

}, doi = {10.24963/ijcai.2021/386}, url = {https://www.ijcai.org/proceedings/2021}, author = {Mao, Jiayuan and Luo, Zhezheng and Gan, Chuang and Joshua B. Tenenbaum and Wu, Jiajun and Kaelbling, Leslie Pack and Ullman, Tomer D.}, editor = {Zhou, Zhi-Hua} } @article {5083, title = {Temporally delayed linear modelling (TDLM) measures replay in both animals and humans}, journal = {eLife}, volume = {10}, year = {2021}, month = {06/2021}, abstract = {

There are rich structures in off-task neural activity which are hypothesized to reflect fundamental computations across a broad spectrum of cognitive functions. Here, we develop an analysis toolkit - temporal delayed linear modelling (TDLM) - for analysing such activity. TDLM is a domain-general method for finding neural sequences that respect a pre-specified transition graph. It combines nonlinear classification and linear temporal modelling to test for statistical regularities in sequences of task-related reactivations. TDLM was developed on non-invasive neuroimaging data and is designed to control for confounds and maximize sequence detection ability. Notably, as a linear framework, TDLM can be easily extended, without loss of generality, to capture rodent replay in electrophysiology, including in continuous spaces, as well as to address second-order inference questions, for example, temporally and spatially varying patterns of replay. We hope TDLM will advance a deeper understanding of neural computation and promote a richer convergence between animal and human neuroscience.
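A compressed sketch of the two-level linear modelling idea described here (decoded state reactivations regressed onto their own lagged copies, then the empirical lagged couplings regressed onto a hypothesized transition graph) follows. It tracks the abstract only at a high level; array shapes and regressors are assumptions, not the released toolkit.

```python
import numpy as np

def sequenceness(X, T_forward, max_lag=40):
    # X: (n_timepoints, n_states) decoded reactivation probabilities
    # T_forward: (n_states, n_states) hypothesized transition graph
    n_states = X.shape[1]
    T_backward = T_forward.T
    evidence = np.zeros(max_lag)
    for lag in range(1, max_lag + 1):
        # First level: predict activity at time t from activity at time t - lag
        beta, *_ = np.linalg.lstsq(X[:-lag], X[lag:], rcond=None)
        # Second level: how well do the empirical couplings follow the graph,
        # controlling for self-transitions and a constant term?
        design = np.column_stack([T_forward.ravel(), T_backward.ravel(),
                                  np.eye(n_states).ravel(),
                                  np.ones(n_states * n_states)])
        coefs, *_ = np.linalg.lstsq(design, beta.ravel(), rcond=None)
        evidence[lag - 1] = coefs[0] - coefs[1]  # forward minus backward sequenceness
    return evidence
```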

}, doi = {10.7554/eLife.66917}, url = {https://elifesciences.org/articles/66917}, author = {Liu, Yunzhe and Dolan, Raymond J and Higgins, Cameron and Hector Penagos and Woolrich, Mark W and {\'O}lafsd{\'o}ttir, H Freyja and Barry, Caswell and Kurth-Nelson, Zeb and Behrens, Timothy E} } @conference {4820, title = {Unsupervised Discovery of 3D Physical Objects}, booktitle = {International Conference on Learning Representations}, year = {2021}, month = {07/2020}, abstract = {

We study the problem of unsupervised physical object discovery. Unlike existing frameworks that aim to learn to decompose scenes into 2D segments purely based on each object{\textquoteright}s appearance, we explore how physics, especially object interactions, facilitates learning to disentangle and segment instances from raw videos, and to infer the 3D geometry and position of each object, all without supervision. Drawing inspiration from developmental psychology, our Physical Object Discovery Network (POD-Net) uses both multi-scale pixel cues and physical motion cues to accurately segment observable and partially occluded objects of varying sizes, and infer properties of those objects. Our model reliably segments objects on both synthetic and real scenes. The discovered object properties can also be used to reason about physical events.

}, url = {https://openreview.net/forum?id=lf7st0bJIA5}, author = {Yilun Du and Kevin A Smith and Tomer Ullman and Joshua B. Tenenbaum and Jiajun Wu} } @conference {5107, title = {On the use of Cortical Magnification and Saccades as Biological Proxies for Data Augmentation}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) Workshop at NeurIPS}, year = {2021}, keywords = {Active Perception, Data-Augmentation, Foveation, Self-Supervised Learning}, url = {https://openreview.net/forum?id=Rpazl253IHb}, author = {Binxu Wang and David Mayo and Arturo Deza and Andrei Barbu and Colin Conwell} } @article {4955, title = {Vector-based pedestrian navigation in cities}, journal = {Nature Computational Science}, volume = {1}, year = {2021}, month = {10/2021}, pages = {678 - 685}, abstract = {

How do pedestrians choose their paths within city street networks? Researchers have tried to shed light on this matter through strictly controlled experiments, but an ultimate answer based on real-world mobility data is still lacking. Here, we analyze salient features of human path planning through a statistical analysis of a massive dataset of GPS traces, which reveals that (1) people increasingly deviate from the shortest path when the distance between origin and destination increases and (2) chosen paths are statistically different when origin and destination are swapped. We posit that direction to goal is a main driver of path planning and develop a vector-based navigation model; the resulting trajectories, which we have termed pointiest paths, are a statistically better predictor of human paths than a model based on minimizing distance with stochastic effects. Our findings generalize across two major US cities with different street networks, hinting that vector-based navigation might be a universal property of human path planning.

}, doi = {10.1038/s43588-021-00130-y}, url = {https://www.nature.com/articles/s43588-021-00130-y}, author = {Bongiorno, Christian and Zhou, Yulun and Marta Kryven and Theurel, David and Rizzo, Alessandro and Santi, Paolo and Joshua B. Tenenbaum and Ratti, Carlo} } @article {5037, title = {Visual Search Asymmetry: Deep Nets and Humans Share Similar Inherent Biases}, year = {2021}, month = {12/2021}, abstract = {

Visual search is a ubiquitous and often challenging daily task, exemplified by looking for the car keys at home or a friend in a crowd. An intriguing property of some classical search tasks is an asymmetry such that finding a target A among distractors B can be easier than finding B among A. To elucidate the mechanisms responsible for asymmetry in visual search, we propose a computational model that takes a target and a search image as inputs and produces a sequence of eye movements until the target is found. The model integrates eccentricity-dependent visual recognition with target-dependent top-down cues. We compared the model against human behavior in six paradigmatic search tasks that show asymmetry in humans. Without prior exposure to the stimuli or task-specific training, the model provides a plausible mechanism for search asymmetry. We hypothesized that the polarity of search asymmetry arises from experience with the natural environment. We tested this hypothesis by training the model on augmented versions of ImageNet where the biases of natural images were either removed or reversed. The polarity of search asymmetry disappeared or was altered depending on the training protocol. This study highlights how classical perceptual properties can emerge in neural network models, without the need for task-specific training, but rather as a consequence of the statistical properties of the developmental diet fed to the model. All source code and data are publicly available at https://github.com/kreimanlab/VisualSearchAsymmetry.

}, url = {https://nips.cc/Conferences/2021/Schedule?showEvent=28848}, author = {Shashi Kant Gupta and Mengmi Zhang and CHIA-CHIEN WU and Jeremy Wolfe and Gabriel Kreiman} } @article {4832, title = {What Is the Model in Model-Based Planning?}, journal = {Cognitive Science}, volume = {45}, year = {2021}, month = {01/2021}, abstract = {

Flexibility is one of the hallmarks of human problem-solving. In everyday life, people adapt to changes in common tasks with little to no additional training. Much of the existing work on flexibility in human problem-solving has focused on how people adapt to tasks in new domains by drawing on solutions from previously learned domains. In real-world tasks, however, humans must generalize across a wide range of within-domain variation. In this work we argue that representational abstraction plays an important role in such within-domain generalization. We then explore the nature of this representational abstraction in realistically complex tasks like video games by demonstrating how the same model-based planning framework produces distinct generalization behaviors under different classes of task representation. Finally, we compare the behavior of agents with these task representations to humans in a series of novel grid-based video game tasks. Our results provide evidence for the claim that within-domain flexibility in humans derives from task representations composed of propositional rules written in terms of objects and relational categories.

}, issn = {0364-0213}, doi = {10.1111/cogs.12928}, url = {https://onlinelibrary.wiley.com/toc/15516709/45/1}, author = {Pouncy, Thomas and Tsividis, Pedro and Samuel J Gershman} } @conference {5108, title = {What Matters In Branch Specialization? Using a Toy Task to Make Predictions}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) Workshop at NeurIPS}, year = {2021}, keywords = {branch specialization, computational vision, curriculum learning}, url = {https://openreview.net/forum?id=0kPS1i6wict}, author = {Chenguang Li and Arturo Deza} } @proceedings {5042, title = {When Pigs Fly: Contextual Reasoning in Synthetic and Natural Scenes}, year = {2021}, month = {08/2021}, abstract = {

Context is of fundamental importance to both human and machine vision; e.g., an object in the air is more likely to be an airplane than a pig. The rich notion of context incorporates several aspects including physics rules, statistical co-occurrences, and relative object sizes, among others. While previous work has focused on crowd-sourced out-of-context photographs from the web to study scene context, controlling the nature and extent of contextual violations has been a daunting task. Here we introduce a diverse, synthetic Out-of-Context Dataset (OCD) with fine-grained control over scene context. By leveraging a 3D simulation engine, we systematically control the gravity, object co-occurrences and relative sizes across 36 object categories in a virtual household environment. We conducted a series of experiments to gain insights into the impact of contextual cues on both human and machine vision using OCD. We conducted psychophysics experiments to establish a human benchmark for out-of-context recognition, and then compared it with state-of-the-art computer vision models to quantify the gap between the two. We propose a context-aware recognition transformer model, fusing object and contextual information via multi-head attention. Our model captures useful information for contextual reasoning, enabling human-level performance and better robustness in out-of-context conditions compared to baseline models across OCD and other out-of-context datasets. All source code and data are publicly available at https://github.com/kreimanlab/WhenPigsFlyContext.

}, doi = {10.1109/iccv48922.2021.00032}, author = {Philipp Bomatter and Mengmi Zhang and Dimitar Karev and Spandan Madan and Claire Tseng and Gabriel Kreiman} } @article {4406, title = {The ability to predict actions of others from distributed cues is still developing in children}, journal = {PsyArXiv Preprints}, year = {2020}, month = {01/2020}, abstract = {

Adults use distributed cues in the bodies of others to predict and counter their actions. To investigate the development of this ability, adults and 6- to 8-year-old children played a competitive game with a confederate who reached toward one of two targets. Child and adult participants, who sat across from the confederate, attempted to beat the confederate to the target by touching it before the confederate did. Adults used cues distributed through the head, shoulders, and body to predict the reaching actions. Children, in contrast, used cues in the arms and torso but not in the head, face or shoulders to predict the actions. These results provide evidence for a qualitative change over childhood: the ability to respond rapidly to predictive cues to others{\textquoteright} actions develops slowly. Despite children{\textquoteright}s sensitivity to eye gaze even in infancy, cues from the head and body do not influence their action predictions as late as 8 years of age.

}, keywords = {Action prediction, action understanding, Biological motion, development, Social interaction}, doi = {10.31234/osf.io/pu3tf}, author = {Daniel Kim and Emalie McMahon and Samuel Mehr and Ken Nakayama and Elizabeth S Spelke and Maryam Vaziri-Pashkam} } @article {4727, title = {Acute social isolation evokes midbrain craving responses similar to hunger}, journal = {Nature Neuroscience}, volume = {23}, year = {2020}, month = {11/2020}, pages = {1597 - 1605}, abstract = {

When people are forced to be isolated from each other, do they crave social interactions? To address this question, we used functional magnetic resonance imaging to measure neural responses evoked by food and social cues after participants (n = 40) experienced 10 h of mandated fasting or total social isolation. After isolation, people felt lonely and craved social interaction. Midbrain regions showed selective activation to food cues after fasting and to social cues after isolation; these responses were correlated with self-reported craving. By contrast, striatal and cortical regions differentiated between craving food and craving social interaction. Across deprivation sessions, we found that deprivation narrows and focuses the brain{\textquoteright}s motivational responses to the deprived target. Our results support the intuitive idea that acute isolation causes social craving, similar to the way fasting causes hunger.

}, issn = {1097-6256}, doi = {10.1038/s41593-020-00742-z}, url = {http://www.nature.com/articles/s41593-020-00742-z}, author = {Tomova, Livia and Wang, Kimberly L. and Thompson, Todd and Matthews, Gillian A. and Takahashi, Atsushi and Tye, Kay M. and Rebecca Saxe} } @conference {4699, title = {AI Feynman 2.0: Pareto-optimal symbolic regression exploiting graph modularity}, booktitle = {Advances in Neural Information Processing Systems 33 pre-proceedings (NeurIPS 2020)}, year = {2020}, month = {12/2020}, abstract = {

We present an improved method for symbolic regression that seeks to fit data to formulas that are Pareto-optimal, in the sense of having the best accuracy for a given complexity. It improves on the previous state-of-the-art by typically being orders of magnitude more robust toward noise and bad data, and also by discovering many formulas that stumped previous methods. We develop a method for discovering generalized symmetries (arbitrary modularity in the computational graph of a formula) from gradient properties of a neural network fit. We use normalizing flows to generalize our symbolic regression method to probability distributions from which we only have samples, and employ statistical hypothesis testing to accelerate robust brute-force search.
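As a generic illustration of the Pareto-optimality criterion used here (keep a candidate formula only if no other candidate is both simpler and more accurate), a small helper is sketched below; the tuple representation of candidates is an assumption for illustration only.

```python
def pareto_front(candidates):
    """Keep formulas not dominated in (complexity, error); lower is better for both.

    candidates: iterable of (complexity, error, formula) tuples.
    """
    front = []
    for c, e, f in sorted(candidates):        # sort by complexity, then error
        if not front or e < front[-1][1]:     # keep only if accuracy improves
            front.append((c, e, f))
    return front

print(pareto_front([(3, 0.10, "a*x"), (5, 0.02, "a*x+b"), (7, 0.05, "a*x**2")]))
# -> [(3, 0.1, 'a*x'), (5, 0.02, 'a*x+b')]   (the third candidate is dominated)
```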

GitHub:\ https://github.com/SJ001/AI-Feynman
Readthedocs: https://ai-feynman.readthedocs.io/en/latest/
Database:\ https://space.mit.edu/home/tegmark/aifeynman.html
}, author = {Silviu-Marian Udrescu and Andrew Tan and Jianhai Feng and Orisvaldo Neto and Tailin Wu and Max Tegmark} } @article {4754, title = {An analysis of training and generalization errors in shallow and deep networks}, journal = {Neural Networks}, volume = {121}, year = {2020}, month = {01/2020}, pages = {229 - 241}, abstract = {

This paper is motivated by an open problem around deep networks, namely, the apparent absence of over-fitting despite large over-parametrization which allows perfect fitting of the training data. In this paper, we analyze this phenomenon in the case of regression problems when each unit evaluates a periodic activation function. We argue that the minimal expected value of the square loss is inappropriate to measure the generalization error in approximation of compositional functions in order to take full advantage of the compositional structure. Instead, we measure the generalization error in the sense of maximum loss, and sometimes, as a pointwise error. We give estimates on exactly how many parameters ensure both zero training error as well as a good generalization error. We prove that a solution of a regularization problem is guaranteed to yield a good training error as well as a good generalization error and estimate how much error to expect at which test data.

}, keywords = {deep learning, generalization error, interpolatory approximation}, issn = {08936080}, doi = {10.1016/j.neunet.2019.08.028}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0893608019302552}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @article {4800, title = {Analyzing Machine-Learned Representations: A Natural Language Case Study}, journal = {Cognitive Science}, volume = {44}, year = {2020}, month = {12/2020}, abstract = {

As modern deep networks become more complex, and get closer to human-like capabilities in certain domains, the question arises as to how the representations and decision rules they learn compare to the ones in humans. In this work, we study representations of sentences in one such artificial system for natural language processing. We first present a diagnostic test dataset to examine the degree of abstract composable structure represented. Analyzing performance on these diagnostic tests indicates a lack of systematicity in representations and decision rules, and reveals a set of heuristic strategies. We then investigate the effect of training distribution on learning these heuristic strategies, and we study changes in these representations with various augmentations to the training set. Our results reveal parallels to the analogous representations in people. We find that these systems can learn abstract rules and generalize them to new contexts under certain circumstances{\textemdash}similar to human zero-shot reasoning. However, we also note some shortcomings in this generalization behavior{\textemdash}similar to human judgment errors like belief bias. Studying these parallels suggests new ways to understand psychological phenomena in humans as well as informs best strategies for building artificial intelligence with human-like language understanding.

}, issn = {0364-0213}, doi = {10.1111/cogs.12925}, url = {https://onlinelibrary.wiley.com/toc/15516709/44/12}, author = {Dasgupta, Ishita and Guo, Demi and Samuel J Gershman and Goodman, Noah D.} } @article {4802, title = {Bayesian Models of Conceptual Development: Learning as Building Models of the World}, journal = {Annual Review of Developmental Psychology}, volume = {2}, year = {2020}, month = {12/2021}, pages = {533 - 558}, issn = {2640-7922}, doi = {10.1146/annurev-devpsych-121318-084833}, url = {https://www.annualreviews.org/doi/10.1146/annurev-devpsych-121318-084833}, author = {Ullman, Tomer D. and Joshua B. Tenenbaum} } @article {4818, title = {Beyond the feedforward sweep: feedback computations in the visual cortex}, journal = {Annals of the New York Academy of Sciences}, volume = {1464}, year = {2020}, month = {02/2020}, pages = {222 - 241}, abstract = {

Visual perception involves the rapid formation of a coarse image representation at the onset of visual processing, which is iteratively refined by late computational processes. These early versus late time windows approximately map onto feedforward and feedback processes, respectively. State-of-the-art convolutional neural networks, the main engine behind recent machine vision successes, are feedforward architectures. Their successes and limitations provide critical information regarding which visual tasks can be solved by purely feedforward processes and which require feedback mechanisms. We provide an overview of recent work in cognitive neuroscience and machine vision that highlights the possible role of feedback processes for both visual recognition and beyond. We conclude by discussing important open questions for future research.

}, issn = {0077-8923}, doi = {10.1111/nyas.v1464.110.1111/nyas.14320}, url = {https://onlinelibrary.wiley.com/toc/17496632/1464/1}, author = {Gabriel Kreiman and Serre, Thomas} } @article {4455, title = {Beyond the feedforward sweep: feedback computations in the visual cortex}, journal = {Ann. N.Y. Acad. Sci. | Special Issue: The Year in Cognitive Neuroscience}, volume = {1464}, year = {2020}, month = {02/2020}, pages = {222-241}, abstract = {

Visual perception involves the rapid formation of a coarse image representation at the onset of visual processing, which is iteratively refined by late computational processes. These early versus late time windows approximately map onto feedforward and feedback processes, respectively. State-of-the-art convolutional neural networks, the main engine behind recent machine vision successes, are feedforward architectures. Their successes and limitations provide critical information regarding which visual tasks can be solved by purely feedforward processes and which require feedback mechanisms. We provide an overview of recent work in cognitive neuroscience and machine vision that highlights the possible role of feedback processes for both visual recognition and beyond. We conclude by discussing important open questions for future research.

}, keywords = {deeplearning;neuralnetworks;machinevision;visualreasoning;categorization;grouping}, doi = {10.1111/nyas.14320}, url = {https://nyaspubs.onlinelibrary.wiley.com/doi/10.1111/nyas.14320}, author = {Gabriel Kreiman and Serre, Thomas} } @article {4572, title = {Biologically Inspired Mechanisms for Adversarial Robustness}, year = {2020}, month = {06/2020}, abstract = {

A convolutional neural network strongly robust to adversarial perturbations at reasonable computational and performance cost has not yet been demonstrated. The primate visual ventral stream seems to be robust to small perturbations in visual stimuli but the underlying mechanisms that give rise to this robust perception are not understood. In this work, we investigate the role of two biologically plausible mechanisms in adversarial robustness. We demonstrate that the non-uniform sampling performed by the primate retina and the presence of multiple receptive fields with a range of receptive field sizes at each eccentricity improve the robustness of neural networks to small adversarial perturbations. We verify that these two mechanisms do not suffer from gradient obfuscation and study their contribution to adversarial robustness through ablation studies.

}, author = {Manish Vuyyuru Reddy and Andrzej Banburski and Nishka Pant and Tomaso Poggio} } @article {4457, title = {Can Deep Learning Recognize Subtle Human Activities?}, journal = {CVPR 2020}, year = {2020}, month = {01/2020}, author = {Jacquot, V and Ying, J and Gabriel Kreiman} } @article {4468, title = {Can we Contain Covid-19 without Locking-down the Economy?}, year = {2020}, month = {03/2020}, abstract = {

We present an analysis of a risk-based selective quarantine model where the population is divided into low and high-risk groups. The high-risk group is quarantined until the low-risk group achieves herd-immunity. We tackle the question of whether this model is safe, in the sense that the health system can contain the number of low-risk people that require severe ICU care (such as life support systems).
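A back-of-the-envelope sketch of the kind of capacity check the abstract describes (can the health system absorb severe cases among the low-risk group while it reaches herd immunity?), using the standard herd-immunity threshold 1 - 1/R0. All parameter names and numbers below are placeholders, not figures or formulas from the paper.

```python
def icu_demand_ok(n_low_risk, r0, p_severe, icu_beds, weeks_to_herd_immunity):
    # Fraction of the low-risk group infected before herd immunity is reached
    herd_threshold = 1.0 - 1.0 / r0
    total_severe = n_low_risk * herd_threshold * p_severe
    # Crude check: spread severe cases evenly over the epidemic's duration
    severe_per_week = total_severe / weeks_to_herd_immunity
    return severe_per_week <= icu_beds

# Placeholder numbers, purely illustrative
print(icu_demand_ok(n_low_risk=5_000_000, r0=2.0, p_severe=0.001,
                    icu_beds=3_000, weeks_to_herd_immunity=20))
```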

}, author = {Shai Shalev-Shwartz and Amnon Shashua} } @article {4583, title = {On the Capability of Neural Networks to Generalize to Unseen Category-Pose Combinations}, year = {2020}, month = {07/2020}, abstract = {

Recognizing an object{\textquoteright}s category and pose lies at the heart of visual understanding. Recent works suggest that deep neural networks (DNNs) often fail to generalize to category-pose combinations not seen during training. However, it is unclear when and how such generalization may be possible. Does the number of combinations seen during training impact generalization? Is it better to learn category and pose in separate networks, or in a single shared network? Furthermore, what are the neural mechanisms that drive the network{\textquoteright}s generalization? In this paper, we answer these questions by analyzing state-of-the-art DNNs trained to recognize both object category and pose (position, scale, and 3D viewpoint) with quantitative control over the number of category-pose combinations seen during training. We also investigate the emergence of two types of specialized neurons that can explain generalization to unseen combinations{\textemdash}neurons selective to category and invariant to pose, and vice versa. We perform experiments on MNIST extended with position or scale, the iLab dataset with vehicles at different viewpoints, and a challenging new dataset for car model recognition and viewpoint estimation that we introduce in this paper, the Biased-Cars dataset. Our results demonstrate that as the number of combinations seen during training increases, networks generalize better to unseen category-pose combinations, facilitated by an increase in the selectivity and invariance of individual neurons. We find that learning category and pose in separate networks compared to a shared one leads to an increase in such selectivity and invariance, as separate networks are not forced to preserve information about both category and pose. This enables separate networks to significantly outperform shared ones at predicting unseen category-pose combinations.

}, author = {Spandan Madan and Timothy Henry and Jamell Dozier and Helen Ho and Nishchal Bhandari and Tomotake Sasaki and Fredo Durand and Hanspeter Pfister and Xavier Boix} } @article {4803, title = {Communicating Compositional Patterns}, journal = {Open Mind}, volume = {4}, year = {2020}, month = {08/2020}, pages = {25 - 39}, doi = {10.1162/opmi_a_00032}, url = {https://direct.mit.edu/opmi/article/95939}, author = {Schulz, Eric and Quiroga, Francisco and Samuel J Gershman} } @article {4433, title = {Complexity Control by Gradient Descent in Deep Networks}, journal = {Nature Communications}, volume = {11}, year = {2020}, month = {02/2020}, abstract = {

Overparametrized deep networks predict well despite the lack of explicit complexity control during training, such as an explicit regularization term. For exponential-type loss functions, we solve this puzzle by showing an effective regularization effect of gradient descent in terms of the normalized weights that are relevant for classification.

}, doi = {https://doi.org/10.1038/s41467-020-14663-9}, url = {https://www.nature.com/articles/s41467-020-14663-9}, author = {Tomaso Poggio and Qianli Liao and Andrzej Banburski} } @conference {4697, title = {CUDA-Optimized real-time rendering of a Foveated Visual System}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020}, year = {2020}, month = {12/2020}, abstract = {

The spatially-varying field of the human visual system has recently received a resurgence of interest with the development of virtual reality (VR) and neural networks. The computational demands of high resolution rendering desired for VR can be offset by savings in the periphery [16], while neural networks trained with foveated input have shown perceptual gains in i.i.d and o.o.d generalization [25, 6]. In this paper, we present a technique that exploits the CUDA GPU architecture to efficiently generate Gaussian-based foveated images at high definition (1920px {\texttimes} 1080px) in real-time (165 Hz), with a larger number of pooling regions than previous Gaussian-based foveation algorithms by several orders of magnitude [10, 25], producing a smoothly foveated image that requires no further blending or stitching, and that can be well fit for any contrast sensitivity function. The approach described can be adapted from Gaussian blurring to any eccentricity-dependent image processing and our algorithm can meet demand for experimentation to evaluate the role of spatially-varying processing across biological and artificial agents, so that foveation can be added easily on top of existing systems rather than forcing their redesign ({\textquotedblleft}emulated foveated renderer{\textquotedblright} [22]). Altogether, this paper demonstrates how a GPU, with a CUDA block-wise architecture, can be employed for radially-variant rendering, with opportunities for more complex post-processing to ensure a metameric foveation scheme [33].
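A naive CPU-side sketch of eccentricity-dependent Gaussian foveation (not the CUDA renderer described here): pre-blur the image at a few scales and, per pixel, pick the level whose blur grows with distance from the fovea. Pooling-region details and the contrast-sensitivity fit are beyond this illustration, and the linear eccentricity mapping is an assumption.

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def foveate(img, fovea_xy, sigmas=(0.0, 1.0, 2.0, 4.0, 8.0)):
    # img: 2D grayscale array; fovea_xy: (x, y) fixation point in pixels
    h, w = img.shape
    levels = np.stack([gaussian_filter(img, s) if s > 0 else img for s in sigmas])
    ys, xs = np.mgrid[0:h, 0:w]
    ecc = np.hypot(xs - fovea_xy[0], ys - fovea_xy[1])
    # Map eccentricity linearly onto the available blur levels (a crude stand-in
    # for an eccentricity-dependent contrast sensitivity function)
    idx = np.clip((ecc / ecc.max()) * (len(sigmas) - 1), 0, len(sigmas) - 1).astype(int)
    return levels[idx, ys, xs]
```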

}, url = {https://arxiv.org/abs/2012.08655}, author = {Elian Malkin and Arturo Deza and Tomaso Poggio} } @article {5060, title = {Deep compositional robotic planners that follow natural language commands}, year = {2020}, abstract = {

We demonstrate how a sampling-based robotic planner can be augmented to learn to understand a sequence of natural language commands in a continuous configuration space to move and manipulate objects. Our approach combines a deep network structured according to the parse of a complex command that includes objects, verbs, spatial relations, and attributes, with a sampling-based planner, RRT. A recurrent hierarchical deep network controls how the planner explores the environment, determines when a planned path is likely to achieve a goal, and estimates the confidence of each move to trade off exploitation and exploration between the network and the planner. Planners are designed to have near-optimal behavior when information about the task is missing, while networks learn to exploit observations which are available from the environment, making the two naturally complementary. Combining the two enables generalization to new maps, new kinds of obstacles, and more complex sentences that do not occur in the training set. Little data is required to train the model despite it jointly acquiring a CNN that extracts features from the environment as it learns the meanings of words. The model provides a level of interpretability through the use of attention maps allowing users to see its reasoning steps despite being an end-to-end model. This end-to-end model allows robots to learn to follow natural language commands in challenging continuous environments.

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4519, title = {Deep compositional robotic planners that follow natural language commands }, booktitle = {International Conference on Robotics and Automation (ICRA)}, year = {2020}, month = {05/2020}, address = {Palais des Congr{\`e}s de Paris, Paris, France}, author = {Yen-Ling Kuo and Katz, Boris and Andrei Barbu} } @article {4478, title = {Do Neural Networks for Segmentation Understand Insideness?}, year = {2020}, month = {04/2020}, abstract = {

The insideness problem is an image segmentation modality that consists of determining which pixels are inside and outside a region. Deep Neural Networks (DNNs) excel in segmentation benchmarks, but it is unclear that they have the ability to solve the insideness problem as it requires evaluating long-range spatial dependencies. In this paper, the insideness problem is analyzed in isolation, without texture or semantic cues, such that other aspects of segmentation do not interfere in the analysis. We demonstrate that DNNs for segmentation with few units have sufficient complexity to solve insideness for any curve. Yet, such DNNs have severe problems to learn general solutions. Only recurrent networks trained with small images learn solutions that generalize well to almost any curve. Recurrent networks can decompose the evaluation of long-range dependencies into a sequence of local operations, and learning with small images alleviates the common difficulties of training recurrent networks with a large number of unrolling steps.
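
As a concrete point of reference (not one of the networks studied in the paper), insideness for a closed curve can be computed by a classical sequence of local operations, here a flood fill of the background from the image border; a recurrent network can in principle learn an equivalent iterated local rule.

from collections import deque
import numpy as np

def insideness(curve):
    # curve: binary 2D array (1 = pixel on the closed curve).
    # Flood-fill the background from the image border; anything not reached
    # and not on the curve is inside.
    h, w = curve.shape
    outside = np.zeros((h, w), dtype=bool)
    q = deque((y, x) for y in range(h) for x in range(w)
              if (y in (0, h - 1) or x in (0, w - 1)) and curve[y, x] == 0)
    for y, x in q:
        outside[y, x] = True
    while q:
        y, x = q.popleft()
        for dy, dx in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            ny, nx = y + dy, x + dx
            if 0 <= ny < h and 0 <= nx < w and not outside[ny, nx] and curve[ny, nx] == 0:
                outside[ny, nx] = True
                q.append((ny, nx))
    return (~outside) & (curve == 0)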

}, author = {Kimberly M. Villalobos and Vilim Štih and Amineh Ahmadinejad and Shobhita Sundaram and Jamell Dozier and Andrew Francl and Frederico Azevedo and Tomotake Sasaki and Xavier Boix} } @article {4680, title = {Dreaming with ARC}, year = {2020}, month = {11/2020}, abstract = {

Current machine learning algorithms are highly specialized to whatever it is they are meant to do {\textendash}{\textendash} e.g. playing chess, picking up objects, or object recognition. How can we extend this to a system that could solve a wide range of problems? We argue that this can be achieved by a modular system {\textendash}{\textendash} one that can adapt to solving different problems by changing only the modules chosen and the order in which those modules are applied to the problem. The recently introduced ARC (Abstraction and Reasoning Corpus) dataset serves as an excellent test of abstract reasoning. Suited to the modular approach, the tasks depend on a set of human Core Knowledge inbuilt priors. In this paper we implement these priors as the modules of our system and combine them using neural-guided program synthesis.
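
A toy sketch of neural-guided program synthesis over modules, in the spirit described above; the module set, the stand-in guide, and the beam search details are illustrative assumptions rather than the system built in the paper.

from itertools import product

def synthesize(train_pairs, modules, guide, max_depth=3, beam=50):
    # Search over compositions of modules (applied left to right), keeping the
    # candidates ranked highest by a neural guide, and return the first program
    # consistent with all training input/output pairs.
    for depth in range(1, max_depth + 1):
        candidates = sorted(product(modules, repeat=depth),
                            key=lambda prog: -guide(prog, train_pairs))[:beam]
        for prog in candidates:
            def run(x, prog=prog):
                for f in prog:
                    x = f(x)
                return x
            if all(run(inp) == out for inp, out in train_pairs):
                return prog
    return None

# toy usage: discover "add one, then double"
modules = [lambda x: x + 1, lambda x: 2 * x, lambda x: x - 1]
guide = lambda prog, pairs: 0.0            # uniform stand-in for a learned guide
prog = synthesize([(1, 4), (3, 8)], modules, guide)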

}, author = {Andrzej Banburski and Anshula Gandhi and Simon Alford and Sylee Dandekar and Peter Chin and Tomaso Poggio} } @article {4448, title = {Efficient inverse graphics in biological face processing}, journal = {Science Advances}, volume = {6}, year = {2020}, month = {03/2020}, pages = {eaax5979}, abstract = {

Vision not only detects and recognizes objects, but performs rich inferences about the underlying scene structure that causes the patterns of light we see. Inverting generative models, or {\textquotedblleft}analysis-by-synthesis{\textquotedblright}, presents a possible solution, but its mechanistic implementations have typically been too slow for online perception, and their mapping to neural circuits remains unclear. Here we present a neurally plausible efficient inverse graphics model and test it in the domain of face recognition. The model is based on a deep neural network that learns to invert a three-dimensional face graphics program in a single fast feedforward pass. It explains human behavior qualitatively and quantitatively, including the classic {\textquotedblleft}hollow face{\textquotedblright} illusion, and it maps directly onto a specialized face-processing circuit in the primate brain. The model fits both behavioral and neural data better than state-of-the-art computer vision models, and suggests an interpretable reverse-engineering account of how the brain transforms images into percepts.

}, doi = {10.1126/sciadv.aax5979}, url = {https://advances.sciencemag.org/lookup/doi/10.1126/sciadv.aax5979}, author = {Ilker Yildirim and Mario Belledonne and W. A. Freiwald and Joshua B. Tenenbaum} } @conference {4540, title = {Emergence of Pragmatic Reasoning From Least-Effort Optimization }, booktitle = {13th International Conference on the Evolution of Language (EvoLang) }, year = {2020}, month = {04/2020}, address = {The conference was canceled due to Covid-19}, author = {Noga Zaslavsky and Jennifer Hu and Roger Levy} } @article {5061, title = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of LTL formulas}, year = {2020}, abstract = {

We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multi-task agents that discover sub-tasks and execute them without any additional training, as well as the agents which follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?
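
The compositional structure being exploited can be illustrated by recursively evaluating an LTL formula over a finite trace; in the paper each operator corresponds to a learned recurrent module composed according to this same parse, rather than the hand-coded semantics sketched here.

def holds(formula, trace, t=0):
    # Recursively evaluate an LTL formula over a finite trace of symbol sets.
    # Formulas are nested tuples, e.g. ('until', ('atom', 'a'), ('atom', 'b')).
    op = formula[0]
    if op == 'atom':
        return t < len(trace) and formula[1] in trace[t]
    if op == 'not':
        return not holds(formula[1], trace, t)
    if op == 'and':
        return holds(formula[1], trace, t) and holds(formula[2], trace, t)
    if op == 'next':
        return holds(formula[1], trace, t + 1)
    if op == 'eventually':
        return any(holds(formula[1], trace, k) for k in range(t, len(trace)))
    if op == 'always':
        return all(holds(formula[1], trace, k) for k in range(t, len(trace)))
    if op == 'until':
        return any(holds(formula[2], trace, k) and
                   all(holds(formula[1], trace, j) for j in range(t, k))
                   for k in range(t, len(trace)))
    raise ValueError(op)

# example: "a until b" over the trace a, a, b
print(holds(('until', ('atom', 'a'), ('atom', 'b')), [{'a'}, {'a'}, {'b'}]))  # True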

}, author = {Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4804, title = {Encoding formulas as deep networks: Reinforcement learning for zero-shot execution of LTL formulas}, booktitle = {2020 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)}, year = {2020}, address = {Las Vegas, NV, USA}, abstract = {

We demonstrate a reinforcement learning agent which uses a compositional recurrent neural network that takes as input an LTL formula and determines satisfying actions. The input LTL formulas have never been seen before, yet the network performs zero-shot generalization to satisfy them. This is a novel form of multi-task learning for RL agents where agents learn from one diverse set of tasks and generalize to a new set of diverse tasks. The formulation of the network enables this capacity to generalize. We demonstrate this ability in two domains. In a symbolic domain, the agent finds a sequence of letters that is accepted. In a Minecraft-like environment, the agent finds a sequence of actions that conform to the formula. While prior work could learn to execute one formula reliably given examples of that formula, we demonstrate how to encode all formulas reliably. This could form the basis of new multitask agents that discover sub-tasks and execute them without any additional training, as well as the agents which follow more complex linguistic commands. The structures required for this generalization are specific to LTL formulas, which opens up an interesting theoretical question: what structures are required in neural networks for zero-shot generalization to different logics?

}, doi = {10.1109/IROS45743.2020.9341325}, url = {https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=9340668}, author = {Kuo, Yen-Ling and Katz, Boris and Andrei Barbu} } @conference {4532, title = {Evidence that recurrent pathways between the prefrontal and inferior temporal cortex is critical during core object recognition }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, Colorado, USA}, author = {Kohitij Kar and James J. DiCarlo} } @article {4482, title = {An Exit Strategy from the Covid-19 Lockdown based on Risk-sensitive Resource Allocation}, year = {2020}, month = {04/2020}, abstract = {

We propose an exit strategy from the COVID-19 lockdown, which is based on risk-sensitive levels of social distancing. At the heart of our approach is the realization that the most effective, yet limited in number, resources should protect those at high risk rather than be applied uniformly across the population. By generalizing the SEIR model to mixed populations, and based on existing data in Israel, we present an analysis of the maximal load on the health system and the total mortality. We argue that risk-sensitive resource allocation combined with risk-sensitive levels of social distancing makes it possible to lower the overall mortality toll in parallel with the resumption of economic activity.
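
A minimal sketch of a two-group generalization of the SEIR model of the kind described, with a group-specific contact matrix; all parameter values are placeholders and are not the calibrated values used in the paper.

import numpy as np

def seir_two_groups(beta, sigma=1/5.0, gamma=1/7.0, days=200, dt=0.1,
                    pop=(0.8, 0.2), i0=1e-4):
    # beta: 2x2 transmission matrix between low-risk (0) and high-risk (1) groups.
    # Returns the infected fraction per group over time (forward-Euler SEIR).
    n = np.array(pop)
    S, E, I, R = n - i0 * n, np.zeros(2), i0 * n, np.zeros(2)
    traj = []
    for _ in range(int(days / dt)):
        force = beta @ (I / n)                 # force of infection on each group
        dS = -S * force
        dE = S * force - sigma * E
        dI = sigma * E - gamma * I
        dR = gamma * I
        S, E, I, R = S + dt * dS, E + dt * dE, I + dt * dI, R + dt * dR
        traj.append(I.copy())
    return np.array(traj)

# stricter distancing for the high-risk group: smaller entries in its row/column
beta = np.array([[0.3, 0.05],
                 [0.05, 0.05]])
infected = seir_two_groups(beta)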

}, author = {Shai Shalev-Shwartz and Amnon Shashua} } @article {5073, title = {Explicit regularization and implicit bias in deep network classifiers trained with the square loss}, journal = {arXiv}, year = {2020}, month = {12/2020}, abstract = {

Deep ReLU networks trained with the square loss have been observed to perform well in classification tasks. We provide here a theoretical justification based on analysis of the associated gradient flow. We show that convergence to a solution with the absolute minimum norm is expected when normalization techniques such as Batch Normalization (BN) or Weight Normalization (WN) are used together with Weight Decay (WD). The main property of the minimizers that bounds their expected error is the norm: we prove that among all the close-to-interpolating solutions, the ones associated with smaller Frobenius norms of the unnormalized weight matrices have better margin and better bounds on the expected classification error. With BN but in the absence of WD, the dynamical system is singular. Implicit dynamical regularization, that is, zero-initial conditions biasing the dynamics towards high-margin solutions, is also possible in the no-BN and no-WD case. The theory yields several predictions, including the role of BN and weight decay, aspects of Papyan, Han and Donoho{\textquoteright}s Neural Collapse, and the constraints induced by BN on the network weights.
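
A hedged schematic of the setting, in our own notation rather than the paper's: with square loss and weight decay $\lambda$ (and with BN or WN analyzed alongside), the gradient flow is
\[
\dot W_k \;=\; -\,\nabla_{W_k}\Big[\tfrac{1}{n}\sum_{i=1}^{n}\big(f(x_i;W)-y_i\big)^2 \;+\; \lambda\sum_{j}\|W_j\|_F^2\Big],
\]
and the claim reviewed above is that, among near-interpolating solutions, those with the smallest Frobenius norms $\|W_j\|_F$ of the unnormalized weight matrices have the largest margin and the best bound on the expected classification error.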

}, url = {https://arxiv.org/abs/2101.00072}, author = {Tomaso Poggio and Qianli Liao} } @article {4805, title = {Face selective patches in marmoset frontal cortex}, journal = {Nature Communications}, volume = {11}, year = {2020}, month = {12/2020}, abstract = {

In humans and macaque monkeys, socially relevant face processing is accomplished via a distributed functional network that includes specialized patches in frontal cortex. It is unclear whether a similar network exists in New World primates, which diverged from Old World primates ~35 million years ago. The common marmoset is a New World primate species ideally placed to address this question given its complex social repertoire. Here, we demonstrate the existence of a putative high-level face processing network in marmosets. Like Old World primates, marmosets show differential activation in anterior cingulate and lateral prefrontal cortices while they view socially relevant videos of marmoset faces. We corroborate the locations of these frontal regions by demonstrating functional and structural connectivity between these regions and temporal lobe face patches. Given the evolutionary separation between macaques and marmosets, our results suggest this frontal network specialized for social face processing predates the separation between Platyrrhini and Catarrhini.

}, doi = {10.1038/s41467-020-18692-2}, url = {http://www.nature.com/articles/s41467-020-18692-2}, author = {Schaeffer, David J. and Selvanayagam, Janahan and Johnston, Kevin D. and Menon, Ravi S. and W. A. Freiwald and Everling, Stefan} } @article {4659, title = {Fast Recurrent Processing via Ventrolateral Prefrontal Cortex Is Needed by the Primate Ventral Stream for Robust Core Visual Object Recognition}, journal = {Neuron}, year = {2020}, month = {10/2020}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core object recognition require additional time to develop for specific, {\textquoteleft}{\textquoteleft}late-solved{\textquoteright}{\textquoteright} images. This suggests the necessity of recurrent processing in these computations. Which brain circuits are responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventrolateral prefrontal cortex (vlPFC) is a critical recurrent node in this system, here, we pharmacologically inactivated parts of vlPFC and simultaneously measured IT activity while monkeys performed object discrimination tasks. vlPFC inactivation deteriorated the quality of late-phase (\>150 ms from image onset) IT population code and produced commensurate behavioral deficits for late-solved images. Finally, silencing vlPFC caused the monkeys{\textquoteright} IT activity and behavior to become more like those produced by feedforward-only ventral stream models. Together with prior work, these results implicate fast recurrent processing through vlPFC as critical to producing behaviorally sufficient object representations in IT.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.09.035}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627320307595}, author = {Kohitij Kar and James J. DiCarlo} } @conference {4817, title = {The fine structure of surprise in intuitive physics: when, why, and how much?}, booktitle = {Proceedings of the 42th Annual Meeting of the Cognitive Science Society - Developing a Mind: Learning in Humans, Animals, and Machines, CogSci 2020, virtual, July 29 - August 1, 2020}, year = {2020}, url = {https://cogsci.mindmodeling.org/2020/papers/0761/index.html}, author = {Kevin A Smith and Lingjie Mei and Shunyu Yao and Jiajun Wu and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman}, editor = {Stephanie Denison and Michael Mack and Yang Xu and Blair C. Armstrong} } @article {4568, title = {For interpolating kernel machines, the minimum norm ERM solution is the most stable}, number = {108}, year = {2020}, month = {06/2020}, abstract = {

We study the average CVloo stability of kernel ridge-less regression and derive corresponding risk bounds. We show that the interpolating solution with minimum norm has the best CVloo stability, which in turn is controlled by the condition number of the empirical kernel matrix. The latter can be characterized in the asymptotic regime where both the dimension and cardinality of the data go to infinity. Under the assumption of random kernel matrices, the corresponding test error follows a double descent curve.
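
To make the object of study concrete, the following sketch computes the minimum-norm (ridgeless) kernel interpolant and the condition number of the empirical kernel matrix that controls its CVloo stability; the RBF kernel and the pseudoinverse route are illustrative choices, not prescribed by the paper.

import numpy as np

def rbf_kernel(X, Z, gamma=1.0):
    d2 = ((X[:, None, :] - Z[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def min_norm_interpolant(X, y, gamma=1.0):
    # Ridgeless kernel regression: among all interpolants in the RKHS,
    # the coefficients K^+ y give the one with minimum norm.
    K = rbf_kernel(X, X, gamma)
    alpha = np.linalg.pinv(K) @ y
    cond = np.linalg.cond(K)          # conditioning of the empirical kernel matrix
    predict = lambda Xnew: rbf_kernel(Xnew, X, gamma) @ alpha
    return predict, cond

X = np.random.randn(50, 3)
y = np.sin(X[:, 0])
f, cond = min_norm_interpolant(X, y)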

}, author = {Akshay Rangamani and Lorenzo Rosasco and Tomaso Poggio} } @article {4750, title = {Function approximation by deep networks}, journal = {Communications on Pure \& Applied Analysis}, volume = {19}, year = {2020}, month = {08/2020}, pages = {4085 - 4095}, abstract = {

We show that deep networks are better than shallow networks at approximating functions that can be expressed as a composition of functions described by a directed acyclic graph, because the deep networks can be designed to have the same compositional structure, while a shallow network cannot exploit this knowledge. Thus, the blessing of compositionality mitigates the curse of dimensionality. On the other hand, a theorem called good propagation of errors allows to "lift" theorems about shallow networks to those about deep networks with an appropriate choice of norms, smoothness, etc. We illustrate this in three contexts where each channel in the deep network calculates a spherical polynomial, a non-smooth ReLU network, or another zonal function network related closely with the ReLU network.

}, keywords = {approximation on the Euclidean sphere, deep networks, degree of approximation}, issn = {1553-5258}, doi = {10.3934/cpaa.2020181}, url = {http://aimsciences.org//article/doi/10.3934/cpaa.2020181}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @article {4806, title = {Gross means Great}, journal = {Progress in Neurobiology}, volume = {195}, year = {2020}, month = {12/2020}, pages = {101924}, issn = {03010082}, doi = {10.1016/j.pneurobio.2020.101924}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0301008220301799}, author = {W. A. Freiwald} } @conference {4527, title = {Hierarchical neural network models that more closely match primary visual cortex tend to better explain higher level visual cortical responses }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, Colorado, USA}, author = {Tiago Marques and Martin Schrimpf and James J. DiCarlo} } @article {4807, title = {Hierarchical structure is employed by humans during visual motion perception}, journal = {Proceedings of the National Academy of Sciences}, volume = {117}, year = {2020}, month = {09/2022}, pages = {24581 - 24589}, issn = {0027-8424}, doi = {10.1073/pnas.2008961117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2008961117}, author = {Bill, Johannes and Pailian, Hrag and Samuel J Gershman and Drugowitsch, Jan} } @article {4570, title = {Hierarchically Local Tasks and Deep Convolutional Networks}, year = {2020}, month = {06/2020}, abstract = {

The main success stories of deep learning, starting with ImageNet, depend on convolutional networks, which on certain tasks perform significantly better than traditional shallow classifiers, such as support vector machines. Is there something special about deep convolutional networks that other learning machines do not possess? Recent results in approximation theory have shown that there is an exponential advantage of deep convolutional-like networks in approximating functions with hierarchical locality in their compositional structure. These mathematical results, however, do not say which tasks are expected to have input-output functions with hierarchical locality. Among all the possible hierarchically local tasks in vision, text and speech we explore a few of them experimentally by studying how they are affected by disrupting locality in the input images. We also discuss a taxonomy of tasks ranging from local, to hierarchically local, to global and make predictions about the type of networks required to perform efficiently on these different types of tasks.
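
One concrete way to read {\textquotedblleft}disrupting locality in the input images{\textquotedblright} is a fixed random permutation of image patches, which preserves local pixel statistics while destroying hierarchical locality; the patch size and permutation scheme below are assumptions for illustration.

import numpy as np

def shuffle_patches(image, patch=8, seed=0):
    # Split a square image into patch x patch blocks and permute the blocks
    # with a fixed random permutation (same permutation for every image).
    h, w = image.shape[:2]
    gh, gw = h // patch, w // patch
    blocks = [image[i*patch:(i+1)*patch, j*patch:(j+1)*patch]
              for i in range(gh) for j in range(gw)]
    order = np.random.RandomState(seed).permutation(len(blocks))
    out = np.zeros_like(image)
    for k, b in enumerate(order):
        i, j = divmod(k, gw)
        out[i*patch:(i+1)*patch, j*patch:(j+1)*patch] = blocks[b]
    return out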

}, keywords = {Compositionality, Inductive Bias, perception, Theory of Deep Learning}, author = {Arturo Deza and Qianli Liao and Andrzej Banburski and Tomaso Poggio} } @article {4808, title = {Hippocampal remapping as hidden state inference}, journal = {eLife}, volume = {9}, year = {2020}, month = {06/2020}, abstract = {

Cells in the hippocampus tuned to spatial location (place cells) typically change their tuning when an animal changes context, a phenomenon known as remapping. A fundamental challenge to understanding remapping is the fact that what counts as a {\textquotedblleft}context change{\textquotedblright} has never been precisely defined. Furthermore, different remapping phenomena have been classified on the basis of how much the tuning changes after different types and degrees of context change, but the relationship between these variables is not clear. We address these ambiguities by formalizing remapping in terms of hidden state inference. According to this view, remapping does not directly reflect objective, observable properties of the environment, but rather subjective beliefs about the hidden state of the environment. We show how the hidden state framework can resolve a number of puzzles about the nature of remapping.
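
A minimal sketch of the hidden state inference view: the animal maintains a posterior over discrete latent contexts and updates it from observations, with remapping corresponding to the posterior shifting to a different context; the observation models and sticky transition prior below are assumptions, not the paper's model.

import numpy as np

def update_context_belief(belief, obs, likelihoods, stay_prob=0.95):
    # belief: prior probability over K hidden contexts.
    # likelihoods[k](obs): probability of the observation under context k.
    K = len(belief)
    T = np.full((K, K), (1 - stay_prob) / (K - 1))
    np.fill_diagonal(T, stay_prob)
    predicted = T.T @ belief
    post = predicted * np.array([lik(obs) for lik in likelihoods])
    return post / post.sum()

# two contexts with different cue statistics (Gaussian-shaped observation models)
liks = [lambda o: np.exp(-(o - 0.0) ** 2), lambda o: np.exp(-(o - 3.0) ** 2)]
belief = np.array([0.9, 0.1])
for o in [0.1, 2.8, 3.1]:          # the environment quietly changes
    belief = update_context_belief(belief, o, liks)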

}, doi = {10.7554/eLife.51140}, url = {https://elifesciences.org/articles/51140}, author = {Honi Sanders and Matthew A. Wilson and Samuel J Gershman} } @article {4625, title = {Implicit dynamic regularization in deep networks}, year = {2020}, month = {08/2020}, abstract = {

Square loss has been observed to perform well in classification tasks, at least as well as cross-entropy. However, a theoretical justification is lacking. Here we develop a theoretical analysis for the square loss that complements the existing asymptotic analysis for the exponential loss.

}, author = {Tomaso Poggio and Qianli Liao and Mengjia Xu} } @article {4656, title = {Incorporating intrinsic suppression in deep neural networks captures dynamics of adaptation in neurophysiology and perception}, journal = {Science Advances}, volume = {6}, year = {2020}, month = {10/2020}, pages = {eabd4205}, abstract = {

Adaptation is a fundamental property of sensory systems that can change subjective experiences in the context of recent information. Adaptation has been postulated to arise from recurrent circuit mechanisms or as a consequence of neuronally intrinsic suppression. However, it is unclear whether intrinsic suppression by itself can account for effects beyond reduced responses. Here, we test the hypothesis that complex adaptation phenomena can emerge from intrinsic suppression cascading through a feedforward model of visual processing. A deep convolutional neural network with intrinsic suppression captured neural signatures of adaptation including novelty detection, enhancement, and tuning curve shifts, while producing aftereffects consistent with human perception. When adaptation was trained in a task where repeated input affects recognition performance, an intrinsic mechanism generalized better than a recurrent neural network. Our results demonstrate that feedforward propagation of intrinsic suppression changes the functional state of the network, reproducing key neurophysiological and perceptual properties of adaptation.
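
A schematic of a neuronally intrinsic suppression mechanism of the kind described (activation-dependent fatigue that accumulates and decays, applied divisively to each unit); the constants and placement are assumptions rather than the paper's fitted parameters.

import numpy as np

class IntrinsicSuppression:
    # Keeps a per-unit suppression state s that accumulates with recent
    # activation and decays over time; responses are divisively reduced by s.
    def __init__(self, shape, alpha=0.9, beta=0.3):
        self.s = np.zeros(shape)
        self.alpha, self.beta = alpha, beta   # decay and accumulation rates

    def __call__(self, activation):
        out = activation / (1.0 + self.s)     # adapted (suppressed) response
        self.s = self.alpha * self.s + self.beta * out
        return out

layer = IntrinsicSuppression(shape=(64,))
x = np.abs(np.random.randn(64))
first, repeat = layer(x.copy()), layer(x.copy())   # repeated input -> reduced response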

}, doi = {10.1126/sciadv.abd4205}, url = {https://advances.sciencemag.org/lookup/doi/10.1126/sciadv.abd4205}, author = {Vinken, K. and Boix, X. and Gabriel Kreiman} } @conference {4542, title = {Infants represent {\textquoteright}like-kin{\textquoteright} affiliation }, booktitle = {Budapest Conference on Cognitive Development}, year = {2020}, month = {01/2020}, address = {Budapest, Hungary}, author = {Ashley J. Thomas and Rebecca Saxe and Elizabeth S Spelke} } @article {4809, title = {Infants{\textquoteright} sensitivity to shape changes in 2D visual forms}, journal = {Infancy}, volume = {25}, year = {2020}, month = {09/2020}, pages = {618 - 639}, issn = {1525-0008}, doi = {10.1111/infa.12343}, url = {https://onlinelibrary.wiley.com/toc/15327078/25/5}, author = {Dillon, Moira R. and Izard, V{\'e}ronique and Elizabeth S Spelke} } @article {4600, title = {The inferior temporal cortex is a potential cortical precursor of orthographic processing in untrained monkeys}, journal = {Nature Communications}, volume = {11}, year = {2020}, month = {08/2020}, abstract = {

The ability to recognize written letter strings is foundational to human reading, but the underlying neuronal mechanisms remain largely unknown. Recent behavioral research in baboons suggests that non-human primates may provide an opportunity to investigate this question. We recorded the activity of hundreds of neurons in V4 and the inferior temporal cortex (IT) while na{\"\i}ve macaque monkeys passively viewed images of letters, English words and non-word strings, and tested the capacity of those neuronal representations to support a battery of orthographic processing tasks. We found that simple linear read-outs of IT (but not V4) population responses achieved high performance on all tested tasks, even matching the performance and error patterns of baboons on word classification. These results show that the IT cortex of untrained primates can serve as a precursor of orthographic processing, suggesting that the acquisition of reading in humans relies on the recycling of a brain network evolved for other visual functions.
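
The {\textquotedblleft}simple linear read-outs{\textquotedblright} referred to above can be illustrated with a cross-validated linear classifier applied to a population-response matrix; the synthetic data, array shapes, and the use of scikit-learn are assumptions for illustration.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# responses: n_images x n_neurons matrix of (simulated) IT firing rates
# labels:    1 if the image is an English word, 0 if a non-word string
rng = np.random.default_rng(0)
responses = rng.normal(size=(400, 200))
labels = rng.integers(0, 2, size=400)

decoder = LogisticRegression(max_iter=1000)
acc = cross_val_score(decoder, responses, labels, cv=5).mean()
print(f"cross-validated word/non-word readout accuracy: {acc:.2f}")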

}, doi = {10.1038/s41467-020-17714-3}, url = {http://www.nature.com/articles/s41467-020-17714-3}, author = {Rishi Rajalingham and Kohitij Kar and Sachi Sanghavi and Dehaene, Stanislas and James J. DiCarlo} } @article {4810, title = {Integrative Benchmarking to Advance Neurally Mechanistic Models of Human Intelligence}, journal = {Neuron}, volume = {108}, year = {2020}, month = {11/2020}, pages = {413 - 423}, issn = {08966273}, doi = {10.1016/j.neuron.2020.07.040}, url = {https://linkinghub.elsevier.com/retrieve/pii/S089662732030605X}, author = {Martin Schrimpf and Kubilius, Jonas and Lee, Michael J. and N. Apurva Ratan Murty and Ajemian, Robert and James J. DiCarlo} } @article {4811, title = {Learning a Natural-language to LTL Executable Semantic Parser for Grounded Robotics}, year = {2020}, month = {12/2020}, institution = {Proceedings of Conference on Robot Learning (CoRL-2020)}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor -- a pretrained end-to-end LTL planner -- must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL; it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, url = {https://corlconf.github.io/paper_385/}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @article {5059, title = {Learning a natural-language to LTL executable semantic parser for grounded robotics}, year = {2020}, month = {08/2020}, abstract = {

Children acquire their native language with apparent ease by observing how language is used in context and attempting to use it themselves. They do so without laborious annotations, negative examples, or even direct corrections. We take a step toward robots that can do the same by training a grounded semantic parser, which discovers latent linguistic representations that can be used for the execution of natural-language commands. In particular, we focus on the difficult domain of commands with a temporal aspect, whose semantics we capture with Linear Temporal Logic, LTL. Our parser is trained with pairs of sentences and executions as well as an executor. At training time, the parser hypothesizes a meaning representation for the input as a formula in LTL. Three competing pressures allow the parser to discover meaning from language. First, any hypothesized meaning for a sentence must be permissive enough to reflect all the annotated execution trajectories. Second, the executor {\textemdash} a pretrained end-to-end LTL planner {\textemdash} must find that the observed trajectories are likely executions of the meaning. Finally, a generator, which reconstructs the original input, encourages the model to find representations that conserve knowledge about the command. Together these ensure that the meaning is neither too general nor too specific. Our model generalizes well, being able to parse and execute both machine-generated and human-generated commands, with near-equal accuracy, despite the fact that the human-generated sentences are much more varied and complex with an open lexicon. The approach presented here is not specific to LTL: it can be applied to any domain where sentence meanings can be hypothesized and an executor can verify these meanings, thus opening the door to many applications for robotic agents.

}, doi = {https://doi.org/10.48550/arXiv.2008.03277}, author = {Christopher Wang and Candace Ross and Yen-Ling Kuo and Boris Katz and Andrei Barbu} } @conference {4702, title = {Learning abstract structure for drawing by efficient motor program induction}, booktitle = {Advances in Neural Information Processing Systems 33 pre-proceedings (NeurIPS 2020)}, year = {2020}, month = {12/2020}, abstract = {

Humans flexibly solve new problems that differ from those previously practiced. This ability to flexibly generalize is supported by learned concepts that represent useful structure common across different problems. Here we develop a naturalistic drawing task to study how humans rapidly acquire structured prior knowledge. The task requires drawing visual figures that share underlying structure, based on a set of composable geometric rules and simple objects. We show that people spontaneously learn abstract drawing procedures that support generalization, and propose a model of how learners can discover these reusable drawing procedures. Trained in the same setting as humans, and constrained to produce efficient motor actions, this model discovers new drawing program subroutines that generalize to test figures and resemble learned features of human behavior. These results suggest that two principles guiding motor program induction in the model - abstraction (programs can reflect high-level structure that ignores figure-specific details) and compositionality (new programs are discovered by recombining previously learned programs) - are key for explaining how humans learn structured internal representations that guide flexible reasoning and learning.

}, url = {https://papers.nips.cc/paper/2020/hash/1c104b9c0accfca52ef21728eaf01453-Abstract.html}, author = {Lucas Tian and Kevin Ellis and Marta Kryven and Joshua B. Tenenbaum} } @conference {4695, title = {Learning Compositional Rules via Neural Program Synthesis}, booktitle = {Advances in Neural Information Processing Systems 33 pre-proceedings (NeurIPS 2020)}, year = {2020}, month = {12/2020}, abstract = {

Many aspects of human reasoning, including language, require learning rules from very little data. Humans can do this, often learning systematic rules from very few examples, and combining these rules to form compositional rule-based systems. Current neural architectures, on the other hand, often fail to generalize in a compositional manner, especially when evaluated in ways that vary systematically from training. In this work, we present a neuro-symbolic model which learns entire rule systems from a small set of examples. Instead of directly predicting outputs from inputs, we train our model to induce the explicit system of rules governing a set of previously seen examples, drawing upon techniques from the neural program synthesis literature. Our rule-synthesis approach outperforms neural meta-learning techniques in three domains: an artificial instruction-learning domain used to evaluate human learning, the SCAN challenge datasets, and learning rule-based translations of number words into integers for a wide range of human languages.

Code can be found via the link in the paper{\textquoteright}s arXiv entry.

}, url = {https://proceedings.neurips.cc/paper/2020/hash/7a685d9edd95508471a9d3d6fcace432-Abstract.html}, author = {Maxwell Nye and Armando Solar-Lezama and Joshua B. Tenenbaum and Brenden M Lake} } @article {4812, title = {Learning from multiple informants: Children{\textquoteright}s response to epistemic bases for consensus judgments}, journal = {Journal of Experimental Child Psychology}, volume = {192}, year = {2020}, month = {04/2020}, pages = {104759}, keywords = {Consensus, Epistemic vigilance, Selective learning, social cognition, Testimony, Young children}, issn = {00220965}, doi = {10.1016/j.jecp.2019.104759}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0022096519303108}, author = {Kim, Sunae and Elizabeth S Spelke} } @article {4655, title = {The logic of universalization guides moral judgment}, journal = {Proceedings of the National Academy of Sciences (PNAS)}, year = {2020}, month = {Feb-10-2020}, pages = {202014505}, abstract = {

To explain why an action is wrong, we sometimes say, {\textquotedblleft}What if everybody did that?{\textquotedblright} In other words, even if a single person{\textquoteright}s behavior is harmless, that behavior may be wrong if it would be harmful once universalized. We formalize the process of universalization in a computational model, test its quantitative predictions in studies of human moral judgment, and distinguish it from alternative models. We show that adults spontaneously make moral judgments consistent with the logic of universalization, and report comparable patterns of judgment in children. We conclude that, alongside other well-characterized mechanisms of moral judgment, such as outcome-based and rule-based thinking, the logic of universalizing holds an important place in our moral minds.
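
A toy rendering of the universalization computation (our simplification, not the paper's full model): evaluate the collective outcome that would result if every interested agent performed the action, and judge permissibility from the utility of that hypothetical world.

def universalization_judgment(utility, n_interested, threshold=0.0):
    # utility(k): collective utility if k agents perform the action.
    # The action is judged permissible if the fully universalized outcome
    # (all interested agents acting) is still acceptable.
    return utility(n_interested) >= threshold

# e.g. fishing with hooks vs. with nets on a shared lake
hook_utility = lambda k: 10 - 0.1 * k       # scales gently with users
net_utility = lambda k: 10 - 3.0 * k        # collapses when universalized
print(universalization_judgment(hook_utility, n_interested=5))   # True
print(universalization_judgment(net_utility, n_interested=5))    # False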

}, issn = {0027-8424}, doi = {10.1073/pnas.2014505117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2014505117}, author = {Levine, Sydney and Max Kleiman-Weiner and Laura Schulz and Joshua B. Tenenbaum and Fiery A Cushman} } @article {4576, title = {Loss landscape: SGD has a better view}, year = {2020}, month = {07/2020}, abstract = {

Consider a loss function ... where f(x) is a deep feedforward network with R layers, no bias terms and scalar output. Assume the network is overparametrized, that is, d \>\> n, where d is the number of parameters and n is the number of data points. The networks are assumed to interpolate the training data (i.e. the minimum of L is zero). If GD converges, it will converge to a critical point of L, namely a solution of ... There are two kinds of critical points - those for which each term of the above sum vanishes individually, and those for which the expression only vanishes when all the terms are summed. The main claim in this note is that while GD can converge to both types of critical points, SGD can only converge to the first kind, which includes all global minima.

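
Because the formulas are elided above, here is a hedged reconstruction of the setup in our own notation; the note's exact statement may differ.
\[
L(w) \;=\; \sum_{i=1}^{n}\big(f(x_i;w)-y_i\big)^2, \qquad
\nabla L(w^*) \;=\; 2\sum_{i=1}^{n}\big(f(x_i;w^*)-y_i\big)\,\nabla_w f(x_i;w^*) \;=\; 0 .
\]
Critical points of the first kind are those where each term $\big(f(x_i;w^*)-y_i\big)\nabla_w f(x_i;w^*)$ vanishes individually (in particular all global minima with $L=0$); critical points of the second kind are those where the sum vanishes only through cancellation across terms.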

}, author = {Tomaso Poggio and Yaim Cooper} } @article {4458, title = {Minimal videos: Trade-off between spatial and temporal information in human and machine vision.}, journal = {Cognition}, year = {2020}, month = {08/2020}, abstract = {

Objects and their parts can be visually recognized from purely spatial or purely temporal information but the mechanisms integrating space and time are poorly understood. Here we show that visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by identifying minimal videos: these are short and tiny video clips in which objects, parts, and actions can be reliably recognized, but any reduction in either space or time makes them unrecognizable. Human recognition in minimal videos is invariably accompanied by full interpretation of the internal components of the video. State-of-the-art deep convolutional networks for dynamic recognition cannot replicate human behavior in these configurations. The gap between human and machine vision demonstrated here is due to critical mechanisms for full spatiotemporal interpretation that are lacking in current computational models.

}, keywords = {Comparing deep neural networks and humans, Integration of spatial and temporal visual information, minimal images, Minimal videos, Visual dynamic recognition}, doi = {10.1016/j.cognition.2020.104263}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0010027720300822}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @mastersthesis {4451, title = {Nature and origins of intuitive psychology in human infants}, year = {2020}, author = {Shari Liu} } @article {4504, title = {The neural mechanisms of face processing: cells, areas, networks, and models}, journal = {Current Opinion in Neurobiology}, volume = {60}, year = {2020}, month = {02/2020}, pages = {184 - 191}, abstract = {

Since its discovery, the face-processing network in the brain of the macaque monkey has emerged as a model system that allowed for major neural mechanisms of face recognition to be identified {\textendash} with implications for object recognition at large. Populations of face cells encode faces through broad tuning curves, whose shapes change over time. Face representations differ qualitatively across face areas, and we understand not only the global organization of these specializations but also some of the transformations between face areas, both feed-forward and feed-back, and the computational principles behind face representations and transformations. Facial information is combined with physical features and mnemonic features in extensions of the core network, which forms an early part of the primate social brain.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.12.007}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438819301424}, author = {W. A. Freiwald} } @article {4499, title = {A neural network trained for prediction mimics diverse features of biological neurons and perception}, journal = {Nature Machine Intelligence}, volume = {2}, year = {2020}, month = {04/2020}, pages = {210 - 219}, abstract = {

Recent work has shown that convolutional neural networks (CNNs) trained on image recognition tasks can serve as valuable models for predicting neural responses in primate visual cortex. However, these models typically require biologically infeasible levels of labelled training data, so this similarity must at least arise via different paths. In addition, most popular CNNs are solely feedforward, lacking a notion of time and recurrence, whereas neurons in visual cortex produce complex time-varying responses, even to static inputs. Towards addressing these inconsistencies with biology, here we study the emergent properties of a recurrent generative network that is trained to predict future video frames in a self-supervised manner. Remarkably, the resulting model is able to capture a wide variety of seemingly disparate phenomena observed in visual cortex, ranging from single-unit response dynamics to complex perceptual motion illusions, even when subjected to highly impoverished stimuli. These results suggest potentially deep connections between recurrent predictive neural network models and computations in the brain, providing new leads that can enrich both fields.

}, doi = {10.1038/s42256-020-0170-9}, url = {http://www.nature.com/articles/s42256-020-0170-9}, author = {William Lotter and Gabriel Kreiman and Cox, David} } @article {4461, title = {A neural network trained to predict future video frames mimics critical properties of biological neuronal responses and perception.}, journal = {Nature Machine Learning}, year = {2020}, month = {04/2020}, abstract = {

While deep neural networks take loose inspiration from neuroscience, it is an open question how seriously to take the analogies between artificial deep networks and biological neuronal systems. Interestingly, recent work has shown that deep convolutional neural networks (CNNs) trained on large-scale image recognition tasks can serve as strikingly good models for predicting the responses of neurons in visual cortex to visual stimuli, suggesting that analogies between artificial and biological neural networks may be more than superficial. However, while CNNs capture key properties of the average responses of cortical neurons, they fail to explain other properties of these neurons. For one, CNNs typically require large quantities of labeled input data for training. Our own brains, in contrast, rarely have access to this kind of supervision, so to the extent that representations are similar between CNNs and brains, this similarity must arise via different training paths. In addition, neurons in visual cortex produce complex time-varying responses even to static inputs, and they dynamically tune themselves to temporal regularities in the visual environment. We argue that these differences are clues to fundamental differences between the computations performed in the brain and in deep networks. To begin to close the gap, here we study the emergent properties of a previously-described recurrent generative network that is trained to predict future video frames in a self-supervised manner. Remarkably, the model is able to capture a wide variety of seemingly disparate phenomena observed in visual cortex, ranging from single unit response dynamics to complex perceptual motion illusions. These results suggest potentially deep connections between recurrent predictive neural network models and the brain, providing new leads that can enrich both fields.

}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {4814, title = {Online Developmental Science to Foster Innovation, Access, and Impact}, journal = {Trends in Cognitive Sciences}, volume = {24}, year = {2020}, month = {09/2020}, pages = {675 - 678}, issn = {13646613}, doi = {10.1016/j.tics.2020.06.004}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661320301455}, author = {Sheskin, Mark and Scott, Kimberly and Mills, Candice M. and Bergelson, Elika and Bonawitz, Elizabeth and Elizabeth S Spelke and Fei-Fei, Li and Keil, Frank C. and Gweon, Hyowon and Joshua B. Tenenbaum and Julian Jara-Ettinger and Adolph, Karen E. and Rhodes, Marjorie and Frank, Michael C. and Mehr, Samuel A. and Laura Schulz} } @article {4815, title = {Origin of perseveration in the trade-off between reward and complexity}, journal = {Cognition}, volume = {204}, year = {2020}, month = {11/2020}, pages = {104394}, abstract = {

When humans and other animals make repeated choices, they tend to repeat previously chosen actions independently of their reward history. This paper locates the origin of perseveration in a trade-off between two computational goals: maximizing rewards and minimizing the complexity of the action policy. We develop an information-theoretic formalization of policy complexity and show how optimizing the trade-off leads to perseveration. Analysis of two data sets reveals that people attain close to optimal trade-offs. Parameter estimation and model comparison supports the claim that perseveration quantitatively agrees with the theoretically predicted functional form (a softmax function with a frequency-dependent action bias).
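
The {\textquotedblleft}softmax function with a frequency-dependent action bias{\textquotedblright} can be written schematically as follows (our notation, not a quotation from the paper):
\[
\pi(a \mid s) \;\propto\; \exp\!\big[\beta\, Q(s,a) + \log P(a)\big],
\]
where $P(a)$ is the marginal action frequency and $\beta$ is set by the constraint on policy complexity; as the allowed complexity shrinks, the $\log P(a)$ term dominates and previously frequent actions are repeated largely independently of reward.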

}, keywords = {Decision making, Information theory, reinforcement learning}, issn = {00100277}, doi = {10.1016/j.cognition.2020.104394}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027720302134}, author = {Samuel J Gershman} } @article {4679, title = {An Overview of Some Issues in the Theory of Deep Networks}, journal = {IEEJ Transactions on Electrical and Electronic Engineering}, volume = {15}, year = {2020}, month = {10/2020}, pages = {1560 - 1571}, abstract = {

During the last few years, significant progress has been made in the theoretical understanding of deep networks. We review our contributions in the areas of approximation theory and optimization. We also introduce a new approach based on cross-validation leave-one-out stability to estimate bounds on the expected error of overparametrized classifiers, such as deep networks.

}, issn = {1931-4973}, doi = {10.1002/tee.23243}, url = {https://onlinelibrary.wiley.com/toc/19314981/15/11}, author = {Tomaso Poggio and Andrzej Banburski} } @conference {4700, title = {PHASE: PHysically-grounded Abstract Social Events for Machine Social Perception}, booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020}, year = {2020}, month = {12/2020}, abstract = {

The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.

}, url = {https://openreview.net/forum?id=_bokm801zhx}, author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum} } @article {4456, title = {Putting visual object recognition in context}, journal = {CVPR 2020}, year = {2020}, month = {01/2020}, abstract = {

Context plays an important role in visual recognition. Recent studies have shown that visual recognition networks can be fooled by placing objects in inconsistent contexts (e.g. a cow in the ocean). To understand and model the role of contextual information in visual recognition, we systematically and quantitatively investigated ten critical properties of where, when, and how context modulates recognition including amount of context, context and object resolution, geometrical structure of context, context congruence, time required to incorporate contextual information, and temporal dynamics of contextual modulation. The tasks involve recognizing a target object surrounded with context in a natural image. As an essential benchmark, we first describe a series of psychophysics experiments, where we alter one aspect of context at a time, and quantify human recognition accuracy. To computationally assess performance on the same tasks, we propose a biologically inspired context aware object recognition model consisting of a two-stream architecture. The model processes visual information at the fovea and periphery in parallel, dynamically incorporates both object and contextual information, and sequentially reasons about the class label for the target object. Across a wide range of behavioral tasks, the model approximates human level performance without retraining for each task, captures the dependence of context enhancement on image properties, and provides initial steps towards integrating scene and object information for visual recognition.

}, author = {Zhang, Mengmi and Tseng, Claire and Gabriel Kreiman} } @article {4683, title = {Rapid trial-and-error learning with simulation supports flexible tool use and physical reasoning}, journal = {Proceedings of the National Academy of Sciences}, year = {2020}, month = {11/2021}, pages = {201912341}, abstract = {

Many animals, and an increasing number of artificial agents, display sophisticated capabilities to perceive and manipulate objects. But human beings remain distinctive in their capacity for flexible, creative tool use{\textemdash}using objects in new ways to act on the world, achieve a goal, or solve a problem. To study this type of general physical problem solving, we introduce the Virtual Tools game. In this game, people solve a large range of challenging physical puzzles in just a handful of attempts. We propose that the flexibility of human physical problem solving rests on an ability to imagine the effects of hypothesized actions, while the efficiency of human search arises from rich action priors which are updated via observations of the world. We instantiate these components in the {\textquotedblleft}sample, simulate, update{\textquotedblright} (SSUP) model and show that it captures human performance across 30 levels of the Virtual Tools game. More broadly, this model provides a mechanism for explaining how people condense general physical knowledge into actionable, task-specific plans to achieve flexible and efficient physical problem solving.
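
A minimal sketch of a sample-simulate-update loop of the kind described above (not the authors' SSUP implementation); the prior, the noisy internal simulator, and the update rule are stand-ins.

import random

def ssup(prior_sample, simulate, act, update_prior, max_attempts=10):
    # Sample candidate actions from a prior, imagine their outcomes with a noisy
    # internal simulator, act on the most promising one, then update the prior
    # from the real outcome. Returns the number of attempts used.
    for attempt in range(1, max_attempts + 1):
        candidates = [prior_sample() for _ in range(20)]
        scored = [(simulate(a), a) for a in candidates]
        best = max(scored, key=lambda pair: pair[0])[1]
        success, observation = act(best)
        if success:
            return attempt
        prior_sample = update_prior(prior_sample, best, observation)
    return max_attempts

# toy usage with stand-in callables (the "world" rewards actions near 0.3)
prior_sample = lambda: random.uniform(-1, 1)
simulate = lambda a: -abs(a - 0.3) + random.gauss(0, 0.1)
act = lambda a: (abs(a - 0.3) < 0.05, a)
update_prior = lambda p, a, obs: (lambda: random.gauss(obs, 0.2))
attempts = ssup(prior_sample, simulate, act, update_prior)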

}, keywords = {intuitive physics, physical problem solving, tool use}, issn = {0027-8424}, doi = {10.1073/pnas.1912341117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1912341117}, author = {Kelsey Allen and Kevin A Smith and Joshua B. Tenenbaum} } @article {4553, title = {Response patterns in the developing social brain are organized by social and emotion features and disrupted in children diagnosed with autism spectrum disorder}, journal = {Cortex}, volume = {125}, year = {2020}, month = {Jan-04-2020}, pages = {12 - 29}, issn = {00109452}, doi = {10.1016/j.cortex.2019.11.021}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31958654}, author = {Richardson, Hilary and Hyowon Gweon and Dodell-Feder, David and Malloy, Caitlin and Pelton, Hannah and Keil, Boris and Nancy Kanwisher and Rebecca Saxe} } @article {4420, title = {Scale and translation-invariance for novel objects in human vision}, journal = {Scientific Reports}, volume = {10}, year = {2020}, month = {01/2020}, abstract = {

Though the range of invariance in recognition of novel objects is a basic aspect of human vision, its characterization has remained surprisingly elusive. Here we report tolerance to scale and position changes in one-shot learning by measuring recognition accuracy of Korean letters presented in a flash to non-Korean subjects who had no previous experience with Korean letters. We found that humans have significant scale-invariance after only a single exposure to a novel object. The range of translation-invariance is limited, depending on the size and position of presented objects. To understand the underlying brain computation associated with the invariance properties, we compared experimental data with computational modeling results. Our results suggest that to explain invariant recognition of objects by humans, neural network models should explicitly incorporate built-in scale-invariance, by encoding different scale channels as well as eccentricity-dependent representations captured by neurons{\textquoteright} receptive field sizes and sampling density that change with eccentricity. Our psychophysical experiments and related simulations strongly suggest that the human visual system uses a computational strategy that differs in some key aspects from current deep learning architectures, being more data efficient and relying more critically on eye-movements.

}, doi = {10.1038/s41598-019-57261-6}, url = {http://www.nature.com/articles/s41598-019-57261-6}, author = {Yena Han and Gemma Roig and Geiger, Gad and Tomaso Poggio} } @conference {4539, title = {Segregation from Noise as Outlier Detection }, booktitle = {Association for Research in Otolaryngology}, year = {2020}, month = {01/2020}, address = {San Jose, CA, USA}, author = {Jarrod M Hicks and Josh H. McDermott} } @proceedings {4692, title = {Simulating a Primary Visual Cortex at the Front of CNNs Improves Robustness to Image Perturbations}, year = {2020}, month = {12/2020}, abstract = {

Current state-of-the-art object recognition models are largely based on convolutional neural network (CNN) architectures, which are loosely inspired by the primate visual system. However, these CNNs can be fooled by imperceptibly small, explicitly crafted perturbations, and struggle to recognize objects in corrupted images that are easily recognized by humans. Here, by making comparisons with primate neural data, we first observed that CNN models with a neural hidden layer that better matches primate primary visual cortex (V1) are also more robust to adversarial attacks. Inspired by this observation, we developed VOneNets, a new class of hybrid CNN vision models. Each VOneNet contains a fixed weight neural network front-end that simulates primate V1, called the VOneBlock, followed by a neural network back-end adapted from current CNN vision models. The VOneBlock is based on a classical neuroscientific model of V1: the linear-nonlinear-Poisson model, consisting of a biologically-constrained Gabor filter bank, simple and complex cell nonlinearities, and a V1 neuronal stochasticity generator. After training, VOneNets retain high ImageNet performance, but each is substantially more robust, outperforming the base CNNs and state-of-the-art methods by 18\% and 3\%, respectively, on a conglomerate benchmark of perturbations comprised of white box adversarial attacks and common image corruptions. Finally, we show that all components of the VOneBlock work in synergy to improve robustness. While current CNN architectures are arguably brain-inspired, the results presented here demonstrate that more precisely mimicking just one stage of the primate visual system leads to new gains in ImageNet-level computer vision applications.

Github: https://github.com/dicarlolab/vonenet
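
A stripped-down sketch in the spirit of the V1 front-end described above, with a fixed Gabor filter bank, simple- and complex-cell nonlinearities, and Poisson-like stochasticity; the filter parameters and counts are illustrative and do not reproduce the VOneBlock.

import numpy as np
from scipy.signal import convolve2d

def gabor(size=11, wavelength=4.0, theta=0.0, sigma=3.0, phase=0.0):
    r = np.arange(size) - size // 2
    x, y = np.meshgrid(r, r)
    xr = x * np.cos(theta) + y * np.sin(theta)
    return np.exp(-(xr**2 + (-x * np.sin(theta) + y * np.cos(theta))**2) / (2 * sigma**2)) \
           * np.cos(2 * np.pi * xr / wavelength + phase)

def v1_front_end(image, n_orientations=4, rng=None):
    # Fixed (untrained) Gabor bank -> rectification (simple cells) and
    # phase-invariant energy (complex cells) -> Poisson-like noise.
    rng = rng or np.random.default_rng(0)
    maps = []
    for k in range(n_orientations):
        theta = k * np.pi / n_orientations
        even = convolve2d(image, gabor(theta=theta, phase=0.0), mode='same')
        odd = convolve2d(image, gabor(theta=theta, phase=np.pi / 2), mode='same')
        maps.append(np.maximum(even, 0))              # simple cell
        maps.append(np.sqrt(even**2 + odd**2))        # complex cell
    rates = np.stack(maps)
    return rng.poisson(np.maximum(rates, 0))          # stochastic spike counts

image = np.random.rand(64, 64)
responses = v1_front_end(image)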

}, url = {https://proceedings.neurips.cc/paper/2020/hash/98b17f068d5d9b7668e19fb8ae470841-Abstract.html}, author = {Joel Dapello and Tiago Marques and Martin Schrimpf and Franziska Geiger and David Cox and James J. DiCarlo} } @article {4816, title = {Social interaction networks in the primate brain}, journal = {Current Opinion in Neurobiology}, volume = {65}, year = {2020}, month = {12/2020}, pages = {49 - 58}, abstract = {

Primate brains have evolved to understand and engage with their social world. Much about the structure of this world can be gleaned from social interactions. Circuits for the analysis of and participation in social interactions have now been mapped. Increased knowledge about their functional specializations and relative spatial locations promises to greatly improve the understanding of the functional organization of the primate social brain. Detailed electrophysiology, as in the case of the face-processing network, of local operations and functional interactions between areas is necessary to uncover neural mechanisms and computation principles of social cognition. New naturalistic behavioral paradigms, behavioral tracking, and new analytical approaches for parallel non-stationary data will be important components toward a neuroscientific theory of primates{\textquoteright} interactive minds.

}, issn = {09594388}, doi = {10.1016/j.conb.2020.08.012}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438820301252}, author = {W. A. Freiwald} } @article {4552, title = {The speed of human social interaction perception}, journal = {NeuroImage}, year = {2020}, month = {Jan-04-2020}, pages = {116844}, abstract = {

The ability to perceive others{\textquoteright} social interactions, here defined as the directed contingent actions between two or more people, is a fundamental part of human experience that develops early in infancy and is shared with other primates. However, the neural computations underlying this ability remain largely unknown. Is social interaction recognition a rapid feedforward process or a slower post-perceptual inference? Here we used magnetoencephalography (MEG) decoding to address this question. Subjects in the MEG viewed snapshots of visually matched real-world scenes containing a pair of people who were either engaged in a social interaction or acting independently. The presence versus absence of a social interaction could be read out from subjects{\textquoteright} MEG data spontaneously, even while subjects performed an orthogonal task. This readout generalized across different people and scenes, revealing abstract representations of social interactions in the human brain. These representations, however, did not come online until quite late, at 300 ms after image onset, well after feedforward visual processes. In a second experiment, we found that social interaction readout still occurred at this same late latency even when subjects performed an explicit task detecting social interactions. We further showed that MEG responses distinguished between different types of social interactions (mutual gaze vs joint attention) even later, around 500 ms after image onset. Taken together, these results suggest that the human brain spontaneously extracts information about others{\textquoteright} social interactions, but does so slowly, likely relying on iterative top-down computations.

}, issn = {10538119}, doi = {10.1016/j.neuroimage.2020.116844}, url = {https://www.ncbi.nlm.nih.gov/pubmed/32302763}, author = {Leyla Isik and Mynick, Anna and Pantazis, Dimitrios and Nancy Kanwisher} } @article {4469, title = {Stable Foundations for Learning: a framework for learning theory (in both the classical and modern regime).}, year = {2020}, month = {03/2020}, abstract = {

We consider here the class of supervised learning algorithms known as Empirical Risk Minimization (ERM). The classical theory by Vapnik and others characterizes universal consistency of ERM in the classical regime in which the architecture of the learning network is fixed and n, the number of training examples, goes to infinity. We do not have a similar general theory for the modern regime of interpolating regressors and overparameterized deep networks, in which d \> n as n goes to infinity.

In this note I propose the outline of such a theory based on the specific notion of CVloo stability of the learning algorithm with respect to perturbations of the training set. The theory shows that for interpolating regressors and separating classifiers (either kernel machines or deep ReLU networks):

  1. minimizing CVloo stability minimizes the expected error
  2. minimum norm solutions are the most stable solutions

The hope is that this approach may lead to a unified theory encompassing both the modern regime and the classical one.
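
For reference, one standard formalization of the CVloo stability notion invoked above (a sketch in our notation, not a quotation from this note): with S a training set of n examples z_i, S^i the set with z_i removed, f_S the hypothesis returned on S, and V a loss function, the algorithm is CVloo stable with rate \beta_{CV}(n) if

\[
  \sup_{1 \le i \le n} \bigl| V(f_{S}, z_i) - V(f_{S^{i}}, z_i) \bigr| \;\le\; \beta_{CV}(n),
  \qquad \beta_{CV}(n) \to 0 \ \text{as} \ n \to \infty .
\]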

}, author = {Tomaso Poggio} } @conference {4528, title = {Temporal information for action recognition only needs to be integrated at a choice level in neural networks and primates }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, CO, USA}, author = {Martin Schrimpf and Fukushi Sato and Sachi Sanghavi and James J. DiCarlo} } @article {4565, title = {Theoretical issues in deep networks}, journal = {Proceedings of the National Academy of Sciences}, year = {2020}, month = {Sep-06-2020}, pages = {201907369}, abstract = {

While deep learning is successful in a number of applications, it is not yet well understood theoretically. A theoretical characterization of deep learning should answer questions about the approximation power of deep networks, the dynamics of optimization, and good out-of-sample performance, despite overparameterization and the absence of explicit regularization. We review our recent results toward this goal. In approximation theory both shallow and deep networks are known to approximate any continuous function at an exponential cost. However, we proved that for certain types of compositional functions, deep networks of the convolutional type (even without weight sharing) can avoid the curse of dimensionality. In characterizing minimization of the empirical exponential loss we consider the gradient flow of the weight directions rather than the weights themselves, since the relevant function underlying classification corresponds to normalized networks. The dynamics of normalized weights turn out to be equivalent to those of the constrained problem of minimizing the loss subject to a unit norm constraint. In particular, the dynamics of typical gradient descent have the same critical points as the constrained problem. Thus there is implicit regularization in training deep networks under exponential-type loss functions during gradient flow. As a consequence, the critical points correspond to minimum norm minimizers. This result is especially relevant because it has been recently shown that, for overparameterized models, selection of a minimum norm solution optimizes cross-validation leave-one-out stability and thereby the expected error. Thus our results imply that gradient descent in deep networks minimizes the expected error.
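
The constrained problem referred to above can be written compactly as follows (our notation, not the paper's): writing the weights of layer k as W_k = \rho_k V_k with \|V_k\| = 1, the claim is that gradient flow on the directions V_k shares its critical points with

\[
  \min_{V_1,\dots,V_K} \; L\bigl(\rho_1 V_1,\dots,\rho_K V_K\bigr)
  \quad \text{subject to} \quad \|V_k\| = 1, \; k = 1,\dots,K .
\]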

}, issn = {0027-8424}, doi = {10.1073/pnas.1907369117}, url = {https://www.pnas.org/content/early/2020/06/08/1907369117}, author = {Tomaso Poggio and Andrzej Banburski and Qianli Liao} } @article {4501, title = {A theory of learning to infer.}, journal = {Psychological Review}, volume = {127}, year = {2020}, month = {04/2020}, pages = {412 - 441}, abstract = {

Bayesian theories of cognition assume that people can integrate probabilities rationally. However, several empirical findings contradict this proposition: human probabilistic inferences are prone to systematic deviations from optimality. Puzzlingly, these deviations sometimes go in opposite directions. Whereas some studies suggest that people underreact to prior probabilities (base rate neglect), other studies find that people underreact to the likelihood of the data (conservatism). We argue that these deviations arise because the human brain does not rely solely on a general-purpose mechanism for approximating Bayesian inference that is invariant across queries. Instead, the brain is equipped with a recognition model that maps queries to probability distributions. The parameters of this recognition model are optimized to get the output as close as possible, on average, to the true posterior. Because of our limited computational resources, the recognition model will allocate its resources so as to be more accurate for high probability queries than for low probability queries. By adapting to the query distribution, the recognition model learns to infer. We show that this theory can explain why and when people underreact to the data or the prior, and a new experiment demonstrates that these two forms of underreaction can be systematically controlled by manipulating the query distribution. The theory also explains a range of related phenomena: memory effects, belief bias, and the structure of response variability in probabilistic reasoning. We also discuss how the theory can be integrated with prior sampling-based accounts of approximate inference.

}, issn = {0033-295X}, doi = {10.1037/rev0000178}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/rev0000178}, author = {Ishita Dasgupta and Eric Schulz and Joshua B. Tenenbaum and Samuel J Gershman} } @article {4632, title = {ThreeDWorld: A Platform for Interactive Multi-Modal Physical Simulation}, journal = {arXiv}, year = {2020}, month = {07/2020}, type = {Preprint}, abstract = {

We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) realtime near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments; 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.

}, url = {https://arxiv.org/abs/2007.04954}, author = {Chuang Gan and Jeremy Schwartz and Seth Alter and Martin Schrimpf and James Traer and Julian De Freitas and Jonas Kubilius and Abhishek Bhandwaldar and Nick Haber and Megumi Sano and Kuno Kim and Elias Wang and Damian Mrowca and Michael Lingelbach and Aidan Curtis and Kevin Feigleis and Daniel Bear and Dan Gutfreund and David Cox and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins} } @article {4633, title = {ThreeDWorld (TDW): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation}, year = {2020}, month = {07/2020}, abstract = {

TDW is a 3D virtual world simulation platform, utilizing state-of-the-art video game engine technology.

A TDW simulation consists of two components: a) the Build, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis and physics simulations; and b) the Controller, an external Python interface to communicate with the build.

Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world.
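
A minimal sketch of this Controller/Build pattern in Python follows. It is an illustration of the workflow described above, not a verbatim excerpt of the TDW API; the module path and the specific command names are assumptions, so consult the project website below for the actual interface.

# Sketch of a Controller sending commands to the Build and reading back data.
# NOTE: module path and "$type" command names are assumptions for illustration.
from tdw.controller import Controller

c = Controller()                                   # launches / connects to the Build
resp = c.communicate([                             # send a list of commands...
    {"$type": "create_empty_environment"},
    {"$type": "send_images", "frequency": "once"},
])                                                 # ...and get serialized output back
# `resp` holds byte arrays describing the state of the virtual world
# (images, transforms, collisions, ...), which Controller-side code parses.
c.communicate({"$type": "terminate"})              # shut the Build down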

TDW provides researchers with near photo-realistic image rendering, high-fidelity audio rendering, realistic physical interactions for a wide variety of material types, a library of customizable objects and environments, a range of "avatar" embodiments for AI agents, support for VR, and a Python API for controlling the simulation.

TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence.

Find out more about ThreeDWorld on the project website using the link below.

}, url = {http://www.threedworld.org/}, author = {Jeremy Schwartz and Seth Alter and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins and Dan Gutfreund and Chuang Gan and James Traer and Jonas Kubilius and Martin Schrimpf and Abhishek Bhandwaldar and Julian De Freitas and Damian Mrowca and Michael Lingelbach and Megumi Sano and Daniel Bear and Kuno Kim and Nick Haber and Chaofei Fan} } @article {4819, title = {Time-dependent discrimination advantages for harmonic sounds suggest efficient coding for memory}, journal = {Proceedings of the National Academy of Sciences}, volume = {117}, year = {2020}, month = {12/2020}, pages = {32169 - 32180}, abstract = {

Perceptual systems have finite memory resources and must store incoming signals in compressed formats. To explore whether representations of a sound{\textquoteright}s pitch might derive from this need for compression, we compared discrimination of harmonic and inharmonic sounds across delays. In contrast to inharmonic spectra, harmonic spectra can be summarized, and thus compressed, using their fundamental frequency (f0). Participants heard two sounds and judged which was higher. Despite being comparable for sounds presented back-to-back, discrimination was better for harmonic than inharmonic stimuli when sounds were separated in time, implicating memory representations unique to harmonic sounds. Patterns of individual differences (correlations between thresholds in different conditions) indicated that listeners use different representations depending on the time delay between sounds, directly comparing the spectra of temporally adjacent sounds, but transitioning to comparing f0s across delays. The need to store sound in memory appears to determine reliance on f0-based pitch, and may explain its importance in music, in which listeners must extract relationships between notes separated in time.
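
To make the harmonic/inharmonic distinction concrete, here is a small NumPy sketch (our construction, not the paper's stimulus code) that synthesizes a complex tone whose partials either sit exactly at multiples of an f0 or are jittered away from them; only the former admits a compressed, f0-based summary.

import numpy as np

def complex_tone(f0=200.0, n_harmonics=10, jitter=0.0, dur=0.5, sr=16000, seed=0):
    """Sum of partials at (possibly jittered) multiples of f0."""
    rng = np.random.default_rng(seed)
    t = np.arange(int(dur * sr)) / sr
    tone = np.zeros_like(t)
    for k in range(1, n_harmonics + 1):
        # jitter=0 -> harmonic; jitter>0 -> partial displaced by up to +/- jitter*f0
        fk = k * f0 + jitter * f0 * rng.uniform(-1, 1)
        tone += np.sin(2 * np.pi * fk * t)
    return tone / n_harmonics

harmonic = complex_tone(jitter=0.0)     # summarizable by f0 = 200 Hz
inharmonic = complex_tone(jitter=0.3)   # no single f0 summarizes the spectrum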

}, issn = {0027-8424}, doi = {10.1073/pnas.2008956117}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.2008956117}, author = {McPherson, Malinda J. and Josh H. McDermott} } @conference {4534, title = {Toward human-like object naming in artificial neural systems }, booktitle = {International Conference on Learning Representations (ICLR 2020), Bridging AI and Cognitive Science workshop}, year = {2020}, month = {04/2020}, address = {Virtual conference (due to Covid-19)}, author = {Tiwalayo Eisape and Roger Levy and Joshua B. Tenenbaum and Noga Zaslavsky} } @conference {4525, title = {Using task-optimized neural networks to understand why brains have specialized processing for faces }, booktitle = {Computational and Systems Neurosciences}, year = {2020}, month = {02/2020}, address = {Denver, CO, USA}, author = {Dobs, Katharina and Alexander J. E. Kell and Julio Martinez-Trujillo and Michael Cohen and Nancy Kanwisher} } @conference {4558, title = {What can human minimal videos tell us about dynamic recognition models?}, booktitle = {International Conference on Learning Representations (ICLR 2020)}, year = {2020}, month = {04/2020}, address = {Virtual Conference}, abstract = {

In human vision objects and their parts can be visually recognized from purely spatial or purely temporal information but the mechanisms integrating space and time are poorly understood. Here we show that human visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by identifying minimal videos: these are short and tiny video clips in which objects, parts, and actions can be reliably recognized, but any reduction in either space or time makes them unrecognizable. State-of-the-art deep networks for dynamic visual recognition cannot replicate human behavior in these configurations. This gap between humans and machines points to critical mechanisms in human dynamic vision that are lacking in current models.

Published as a workshop paper at {\textquotedblleft}Bridging AI and Cognitive Science{\textquotedblright} (ICLR 2020)

}, url = {https://baicsworkshop.github.io/pdf/BAICS_1.pdf}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @conference {4524, title = {Why Are Face and Object Processing Segregated in the Human Brain? Testing Computational Hypotheses with Deep Convolutional Neural Networks }, booktitle = {Conference on Cognitive Computational Neuroscience}, year = {2020}, month = {09/2020}, address = {Berlin, Germany}, author = {Dobs, Katharina and Alexander J. E. Kell and Julio Martinez-Trujillo and Michael Cohen and Nancy Kanwisher} } @article {4657, title = {XDream: Finding preferred stimuli for visual neurons using generative networks and gradient-free optimization}, journal = {PLOS Computational Biology}, volume = {16}, year = {2020}, month = {06/2020}, pages = {e1007973}, abstract = {

A longstanding question in sensory neuroscience is what types of stimuli drive neurons to fire. The characterization of effective stimuli has traditionally been based on a combination of intuition, insights from previous studies, and luck. A new method termed XDream (EXtending DeepDream with real-time evolution for activation maximization) combined a generative neural network and a genetic algorithm in a closed loop to create strong stimuli for neurons in the macaque visual cortex. Here we extensively and systematically evaluate the performance of XDream. We use ConvNet units as in silico models of neurons, enabling experiments that would be prohibitive with biological neurons. We evaluated how the method compares to brute-force search, and how well the method generalizes to different neurons and processing stages. We also explored design and parameter choices. XDream can efficiently find preferred features for visual units without any prior knowledge about them. XDream extrapolates to different layers, architectures, and developmental regimes, performing better than brute-force search, and often better than exhaustive sampling of \>1 million images. Furthermore, XDream is robust to choices of multiple image generators, optimization algorithms, and hyperparameters, suggesting that its performance is locally near-optimal. Lastly, we found no significant advantage to problem-specific parameter tuning. These results establish expectations and provide practical recommendations for using XDream to investigate neural coding in biological preparations. Overall, XDream is an efficient, general, and robust algorithm for uncovering neuronal tuning preferences using a vast and diverse stimulus space. XDream is implemented in Python, released under the MIT License, and works on Linux, Windows, and MacOS.
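
The closed loop at the heart of XDream can be illustrated with a toy Python sketch: a generator maps low-dimensional codes to images, a scoring function stands in for a recorded neuron, and an evolutionary loop ascends the score. The linear generator and linear "neuron" below are stand-ins chosen for brevity, not the models used in the paper.

import numpy as np

rng = np.random.default_rng(0)
CODE_DIM, IMG_DIM, POP, GENERATIONS = 16, 64, 32, 200

G = rng.standard_normal((IMG_DIM, CODE_DIM))       # stand-in "generative network"
target = rng.standard_normal(IMG_DIM)              # stand-in neuronal tuning

def generate(codes):                                # codes -> images
    return codes @ G.T

def neuron(images):                                 # images -> firing rates
    return images @ target

codes = rng.standard_normal((POP, CODE_DIM))
for gen in range(GENERATIONS):
    scores = neuron(generate(codes))
    elite = codes[np.argsort(scores)[-POP // 4:]]   # keep the top quartile
    parents = elite[rng.integers(len(elite), size=(POP, 2))]
    children = parents.mean(axis=1)                 # recombination
    codes = children + 0.1 * rng.standard_normal(children.shape)  # mutation

print("best response:", neuron(generate(codes)).max())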

}, doi = {10.1371/journal.pcbi.1007973}, url = {https://dx.plos.org/10.1371/journal.pcbi.1007973}, author = {Will Xiao and Gabriel Kreiman}, editor = {Fyshe, Alona} } @conference {4546, title = {Analysis of Macaque Monkeys{\textquoteright} Social and Physical Interaction Processing with Eye tracking Data}, booktitle = {The Rockefeller University 2019 Summer Science Research Program (SSRP)}, year = {2019}, month = {08/2019}, address = {New York, NY, USA}, author = {Yutong Zhang and Marciniak, Karolina and W. A. Freiwald} } @article {4240, title = {An analysis of training and generalization errors in shallow and deep networks}, year = {2019}, month = {05/2019}, abstract = {

This paper is motivated by an open problem around deep networks, namely, the apparent absence of overfitting despite large over-parametrization which allows perfect fitting of the training data. In this paper, we analyze this phenomenon in the case of regression problems when each unit evaluates a periodic activation function. We argue that, in order to take full advantage of the compositional structure, the minimal expected value of the square loss is an inappropriate measure of the generalization error when approximating compositional functions. Instead, we measure the generalization error in the sense of maximum loss, and sometimes, as a pointwise error. We give estimates on exactly how many parameters ensure both zero training error as well as a good generalization error. We prove that a solution of a regularization problem is guaranteed to yield a good training error as well as a good generalization error and estimate how much error to expect at which test data.

}, keywords = {deep learning, generalization error, interpolatory approximation}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @conference {4323, title = {Are topographic deep convolutional neural networks better models of the ventral visual stream?}, booktitle = {Conference on Cognitive Computational Neuroscience}, year = {2019}, author = {K.M. Jozwik and Lee, H. and Nancy Kanwisher and James J. DiCarlo} } @proceedings {4385, title = {Beating SGD Saturation with Tail-Averaging and Minibatching}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

While stochastic gradient descent (SGD) is one of the major workhorses in machine learning, the learning properties of many practically used variants are still poorly understood. In this paper, we consider least squares learning in a nonparametric setting and contribute to filling this gap by focusing on the effect and interplay of multiple passes, mini-batching and averaging, in particular tail averaging. Our results show how these different variants of SGD can be combined to achieve optimal learning rates, also providing practical insights. A novel key result is that tail averaging allows faster convergence rates than uniform averaging in the nonparametric setting. Further, we show that a combination of tail-averaging and minibatching allows more aggressive step-size choices than using any one of said components.
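
As a concrete illustration of the variants analyzed above, here is a small NumPy sketch of mini-batch SGD with multiple passes and tail averaging on a toy least-squares problem (our example; the step size and tail fraction are arbitrary).

import numpy as np

rng = np.random.default_rng(0)
n, d = 2000, 20
X = rng.standard_normal((n, d))
w_true = rng.standard_normal(d)
y = X @ w_true + 0.1 * rng.standard_normal(n)

def sgd_tail_average(X, y, passes=5, batch=10, lr=0.01, tail_frac=0.5):
    n, d = X.shape
    w = np.zeros(d)
    iterates = []
    for _ in range(passes):                       # multiple passes over the data
        for i in range(0, n, batch):              # mini-batching
            xb, yb = X[i:i + batch], y[i:i + batch]
            grad = xb.T @ (xb @ w - yb) / len(yb)
            w = w - lr * grad
            iterates.append(w.copy())
    tail = iterates[int(len(iterates) * (1 - tail_frac)):]
    return np.mean(tail, axis=0)                  # average only the tail iterates

w_hat = sgd_tail_average(X, y)
print("estimation error:", np.linalg.norm(w_hat - w_true))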

}, author = {Nicole Muecke and Gergely Neu and Lorenzo Rosasco} } @conference {4460, title = {Biologically-plausible learning algorithms can scale to large datasets.}, booktitle = { International Conference on Learning Representations, (ICLR 2019)}, year = {2019}, abstract = {

The backpropagation (BP) algorithm is often thought to be biologically implausible in the brain. One of the main reasons is that BP requires symmetric weight matrices in the feedforward and feedback pathways. To address this {\textquotedblleft}weight transport problem{\textquotedblright} (Grossberg, 1987), two biologically-plausible algorithms, proposed by Liao et al. (2016) and Lillicrap et al. (2016), relax BP{\textquoteright}s weight symmetry requirements and demonstrate comparable learning capabilities to that of BP on small datasets. However, a recent study by Bartunov et al. (2018) finds that although feedback alignment (FA) and some variants of target-propagation (TP) perform well on MNIST and CIFAR, they perform significantly worse than BP on ImageNet. Here, we additionally evaluate the sign-symmetry (SS) algorithm (Liao et al., 2016), which differs from both BP and FA in that the feedback and feedforward weights do not share magnitudes but share signs. We examined the performance of sign-symmetry and feedback alignment on ImageNet and MS COCO datasets using different network architectures (ResNet-18 and AlexNet for ImageNet; RetinaNet for MS COCO). Surprisingly, networks trained with sign-symmetry can attain classification performance approaching that of BP-trained networks. These results complement the study by Bartunov et al. (2018) and establish a new benchmark for future biologically-plausible learning algorithms on more difficult datasets and more complex architectures.
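
A toy NumPy sketch of the sign-symmetry idea follows: the backward pass propagates errors through sign(W) rather than W itself, so feedback and feedforward weights share signs but not magnitudes. This two-layer example is our illustration, not the paper's ImageNet setup.

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 10))
y = (X[:, 0] + X[:, 1] > 0).astype(float).reshape(-1, 1)

W1 = 0.1 * rng.standard_normal((10, 32))
W2 = 0.1 * rng.standard_normal((32, 1))

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

lr = 0.5
for step in range(500):
    h = np.maximum(X @ W1, 0.0)                 # forward pass: ReLU hidden layer
    p = sigmoid(h @ W2)
    err = p - y                                 # gradient of cross-entropy wrt logits
    dW2 = h.T @ err / len(X)
    # Backprop would propagate err through W2.T; sign-symmetry uses sign(W2).T.
    delta_h = (err @ np.sign(W2).T) * (h > 0)
    dW1 = X.T @ delta_h / len(X)
    W1 -= lr * dW1
    W2 -= lr * dW2

acc = ((sigmoid(np.maximum(X @ W1, 0.0) @ W2) > 0.5) == y).mean()
print("training accuracy:", acc)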

}, author = {Will Xiao and Chen, Honglin and Qianli Liao and Tomaso Poggio} } @article {4262, title = {Blind Constant Modulus Multiuser Detection via Low-Rank Approximation}, journal = {IEEE Signal Processing Letters}, year = {2019}, month = {01/2019}, pages = {1 - 1}, abstract = {

We present a novel convex-optimization-based solution to blind linear multiuser detection in direct-sequence CDMA systems. The solution is based on a convex low-rank approximation of the linearly constrained constant modulus cost function, thus guaranteeing its global minimization. Further, it can be cast as a semidefinite program, implying that it can be solved using interior-point techniques with polynomial time complexity. The solution is parameter free and is shown to be superior to existing solutions in terms of output SINR and BER, especially for a small number of samples.

}, issn = {1070-9908}, doi = {10.1109/LSP.9710.1109/LSP.2019.2918001}, url = {https://ieeexplore.ieee.org/document/8718546}, author = {Amir Adler and Wax, Mati} } @article {4296, title = {Brain Signals Localization by Alternating Projections}, year = {2019}, month = {08/2019}, abstract = {

We present a novel solution to the problem of localization of brain signals. The solution is sequential and iterative, and is based on minimizing the least-squares (LS) criterion by the alternating projection (AP) algorithm, well known in the context of array signal processing. Unlike existing solutions belonging to the linearly constrained minimum variance (LCMV) and to the multiple-signal classification (MUSIC) families, the algorithm is applicable even in the case of a single sample and in the case of synchronous sources. The performance of the solution is demonstrated via simulations.

}, author = {Amir Adler and Mati Wax and Pantazis, Dimitrios} } @proceedings {4379, title = {Brain-Like Object Recognition with High-Performing Shallow Recurrent ANNs}, year = {2019}, month = {10/2019}, address = {Vancouver, Canada}, abstract = {

Deep convolutional artificial neural networks (ANNs) are the leading class of candidate models of the mechanisms of visual processing in the primate ventral stream. While initially inspired by brain anatomy, over the past years, these ANNs have evolved from a simple eight-layer architecture in AlexNet to extremely deep and branching architectures, demonstrating increasingly better object categorization performance, yet bringing into question how brain-like they still are. In particular, typical deep models from the machine learning community are often hard to map onto the brain{\textquoteright}s anatomy due to their vast number of layers and missing biologically-important connections, such as recurrence. Here we demonstrate that better anatomical alignment to the brain and high performance on machine learning as well as neuroscience measures do not have to be in contradiction. We developed CORnet-S, a shallow ANN with four anatomically mapped areas and recurrent connectivity, guided by Brain-Score, a new large-scale composite of neural and behavioral benchmarks for quantifying the functional fidelity of models of the primate ventral visual stream. Despite being significantly shallower than most models, CORnet-S is the top model on Brain-Score and outperforms similarly compact models on ImageNet. Moreover, our extensive analyses of CORnet-S circuitry variants reveal that recurrence is the main predictive factor of both Brain- Score and ImageNet top-1 performance. Finally, we report that the temporal evolution of the CORnet-S "IT" neural population resembles the actual monkey IT population dynamics. Taken together, these results establish CORnet-S, a compact, recurrent ANN, as the current best model of the primate ventral visual stream.
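
The architectural idea, a compact convolutional "area" applied recurrently over time, can be sketched in a few lines of PyTorch. This is our simplification for illustration, not the released CORnet-S code; layer sizes and the number of recurrent steps are arbitrary.

import torch
import torch.nn as nn

class RecurrentArea(nn.Module):
    """One anatomically-inspired area: a convolution applied recurrently over time."""
    def __init__(self, channels, steps=3):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
        self.norm = nn.BatchNorm2d(channels)
        self.steps = steps

    def forward(self, x):
        state = x
        for _ in range(self.steps):             # unrolled recurrence within the area
            state = torch.relu(self.norm(self.conv(state)) + x)
        return state

# A V1-like feedforward stage followed by recurrent areas (roughly V2, V4, IT),
# then a readout, giving recurrence without great depth.
model = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3),
    RecurrentArea(32), nn.MaxPool2d(2),
    RecurrentArea(32), nn.MaxPool2d(2),
    RecurrentArea(32), nn.AdaptiveAvgPool2d(1), nn.Flatten(),
    nn.Linear(32, 1000),                        # ImageNet-style readout
)
print(model(torch.randn(1, 3, 224, 224)).shape)  # torch.Size([1, 1000])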

}, author = {Jonas Kubilius and Martin Schrimpf and Kohitij Kar and Rishi Rajalingham and Ha Hong and Najib J. Majaj and Elias B. Issa and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Aran Nayebi and Daniel Bear and Daniel L K Yamins and James J. DiCarlo} } @article {4514, title = {Choosing a Transformative Experience }, year = {2019}, month = {07/2019}, author = {Marta Kryven and Niemi, L. and Paul, L. and Joshua B. Tenenbaum} } @article {4110, title = {Constant modulus algorithms via low-rank approximation}, journal = {Signal Processing}, volume = {160}, year = {2019}, month = {07/2019}, pages = {263 - 270}, abstract = {

We present a novel convex-optimization-based approach to the solutions of a family of problems involving constant modulus signals. The family of problems includes the constant modulus and the constrained constant modulus, as well as the modified constant modulus and the constrained modified constant modulus. These solutions are shown to constitute semidefinite programs (SDP), thus enabling efficient interior-point methods with polynomial time complexity. The performance of the proposed solutions, demonstrated in several simulated experiments for the task of blind beamforming, is shown to be superior to existing methods.
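
For reference, the (unconstrained) constant modulus cost underlying this family of problems is usually written as follows (standard form in our notation, not copied from the paper): with array snapshot x and beamformer weights w, the output y = w^H x should have approximately constant modulus, and one minimizes

\[
  J_{\mathrm{CM}}(w) \;=\; \mathbb{E}\Bigl[\bigl(\lvert w^{H} x \rvert^{2} - 1\bigr)^{2}\Bigr].
\]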

}, issn = {01651684}, doi = {10.1016/j.sigpro.2019.02.007}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0165168419300568}, author = {Amir Adler and Wax, Mati} } @conference {4188, title = {Data for free: Fewer-shot algorithm learning with parametricity data augmentation}, booktitle = {ICLR 2019}, year = {2019}, month = {04/2019}, abstract = {

We address the problem of teaching an RNN to approximate list-processing algorithms given a small number of input-output training examples. Our approach is to generalize the idea of parametricity from programming language theory to formulate a semantic property that distinguishes common algorithms from arbitrary non-algorithmic functions. This characterization leads naturally to a learned data augmentation scheme that encourages RNNs to learn algorithmic behavior and enables small-sample learning in a variety of list-processing tasks.
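
The augmentation idea can be illustrated with a toy Python sketch: a list-processing algorithm such as reverse does not inspect element identities, so consistently relabeling the elements of an input/output pair yields another valid training example "for free". (This shows the parametricity intuition only; the paper's augmentation scheme is learned.)

import random

rng = random.Random(0)
vocab = list(range(100))

def augment(pair):
    """Relabel the symbols of an input/output pair consistently."""
    inp, out = pair
    symbols = sorted(set(inp) | set(out))
    relabel = dict(zip(symbols, rng.sample(vocab, len(symbols))))
    return [relabel[x] for x in inp], [relabel[x] for x in out]

example = ([3, 1, 4, 1], [1, 4, 1, 3])      # an input/output pair for `reverse`
for _ in range(3):
    print(augment(example))                  # fresh pairs, still consistent with `reverse`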

}, author = {Owen Lewis and Katherine Hermann} } @article {4511, title = {Deep Compositional Robotic Planners that Follow Natural Language Commands.}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Centre, Vancouver, Canada}, url = {https://vigilworkshop.github.io/}, author = {Yen-Ling Kuo and Katz, Boris and Andrei Barbu} } @article {4187, title = {Deep neural network models of sensory systems: windows onto the role of task constraints}, journal = {Current Opinion in Neurobiology}, volume = {55}, year = {2019}, month = {01/2019}, pages = {121 - 132}, abstract = {

Sensory neuroscience aims to build models that predict neural responses and perceptual behaviors, and that provide insight into the principles that give rise to them. For decades, artificial neural networks trained to perform perceptual tasks have attracted interest as potential models of neural computation. Only recently, however, have such systems begun to perform at human levels on some real-world tasks. The recent engineering successes of deep learning have led to renewed interest in artificial neural networks as models of the brain. Here we review applications of deep learning to sensory neuroscience, discussing potential limitations and future directions. We highlight the potential uses of deep neural networks to reveal how task performance may constrain neural systems and behavior. In particular, we consider how task-optimized networks can generate hypotheses about neural representations and functional organization in ways that are analogous to traditional ideal observer models.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.02.003}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438818302034}, author = {Alexander J. E. Kell and Josh H. McDermott} } @conference {4242, title = {Deep Recurrent Architectures for Seismic Tomography}, booktitle = {81st EAGE Conference and Exhibition 2019}, year = {2019}, month = {06/2019}, abstract = {

This paper introduces novel deep recurrent neural network architectures for Velocity Model Building (VMB), going beyond the Machine Learning-based seismic tomography pioneered by Araya-Polo et al. (2018), which was built with a convolutional, non-recurrent neural network. Our investigation includes the utilization of basic recurrent neural network (RNN) cells, as well as Long Short Term Memory (LSTM) and Gated Recurrent Unit (GRU) cells. Performance evaluation reveals that salt bodies are consistently predicted more accurately by GRU- and LSTM-based architectures, as compared to non-recurrent architectures. The results take us a step closer to the final goal of a reliable, fully Machine Learning-based tomography from pre-stack data, which when achieved will reduce the VMB turnaround from weeks to days.

}, author = {Amir Adler and Mauricio Araya-Polo and Tomaso Poggio} } @article {4498, title = {Deep video-to-video transformations for accessibility with an application to photosensitivity}, journal = {Pattern Recognition Letters}, year = {2019}, month = {06/2019}, abstract = {

We demonstrate how to construct a new class of visual assistive technologies that, rather than extract symbolic information, learn to transform the visual environment to make it more accessible. We do so without engineering which transformations are useful, allowing for arbitrary modifications of the visual input. As an instantiation of this idea we tackle a problem that affects and hurts millions worldwide: photosensitivity. Any time an affected person opens a website, video, or some other medium that contains an adverse visual stimulus, either intended or unintended, they might experience a seizure with potentially significant consequences. We show how a deep network can learn a video-to-video transformation rendering such stimuli harmless while otherwise preserving the video. This approach uses a specification of the adverse phenomena, the forward transformation, to learn the inverse transformation. We show how such a network generalizes to real-world videos that have triggered numerous seizures, both by mistake and in politically-motivated attacks. A number of complementary approaches are demonstrated, including using a hand-crafted generator and a GAN using a differentiable perceptual metric. Such technology can be deployed offline to protect videos before they are shown or online with assistive glasses or real-time post processing. Other applications of this general technique include helping those with limited vision, attention deficit hyperactivity disorder, and autism.

}, issn = {01678655}, doi = {10.1016/j.patrec.2019.01.019}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0167865519300133}, author = {Andrei Barbu and Banda, Dalitso and Katz, Boris} } @conference {4263, title = {Direct Localization by Partly Calibrated Arrays: A Relaxed Maximum Likelihood Solution}, booktitle = {27th European Signal Processing Conference, EUSIPCO 2019}, year = {2019}, month = {07/2019}, address = {A Coruna, Spain}, abstract = {

We present a novel relaxed maximum likelihood solution to the problem of direct localization of multiple narrowband sources by partly calibrated arrays, i.e., arrays composed of fully calibrated subarrays yet lacking inter-array calibration. The proposed solution is based on eliminating analytically all the nuisance parameters in the problem, thus reducing the likelihood function to a maximization problem involving only the location of the sources. The performance of the solution is demonstrated via simulations.

}, url = {http://eusipco2019.org/technical-program}, author = {Amir Adler and Mati Wax} } @conference {4522, title = {Disruption of CA1 Sharp-Wave Ripples by the nonbenzodiazepine hypnotic eszopiclone }, booktitle = {Society for Neuroscience}, year = {2019}, month = {10/2019}, address = {Chicago, IL, USA}, author = {Becker, LA and Hector Penagos and Manoach, DS and Matthew A. Wilson and Varela, Carmen} } @article {4255, title = {Divergence in the functional organization of human and macaque auditory cortex revealed by fMRI responses to harmonic tones}, journal = {Nature Neuroscience}, year = {2019}, month = {06/10/2019}, abstract = {

We report a difference between humans and macaque monkeys in the functional organization of cortical regions implicated in pitch perception. Humans but not macaques showed regions with a strong preference for harmonic sounds compared to noise, measured with both synthetic tones and macaque vocalizations. In contrast, frequency-selective tonotopic maps were similar between the two species. This species difference may be driven by the unique demands of speech and music perception in humans.

}, issn = {1097-6256}, doi = {10.1038/s41593-019-0410-7}, url = {https://www.nature.com/articles/s41593-019-0410-7}, author = {Sam V Norman-Haignere and Nancy Kanwisher and Josh H. McDermott and B. R. Conway} } @article {4512, title = { Does intuitive inference of physical stability interruptattention?}, year = {2019}, month = {07/2019}, author = {Marta Kryven and Scholl, B. and Joshua B. Tenenbaum} } @article {4375, title = {Double descent in the condition number}, year = {2019}, month = {12/2019}, abstract = {

In solving a system of n linear equations in d variables Ax=b, the condition number of the (n,d) matrix A measures how much errors in the data b affect the solution x. Bounds of this type are important in many inverse problems. An example is machine learning where the key task is to estimate an underlying function from a set of measurements at random points in a high dimensional space and where low sensitivity to error in the data is a requirement for good predictive performance. Here we report the simple observation that when the columns of A are random vectors, the condition number of A is highest, that is worse, when d=n, that is when the inverse of A exists. An overdetermined system (n\>d) and especially an underdetermined system (n\<d), for which the pseudoinverse must be used instead of the inverse, typically have significantly better, that is lower, condition numbers. Thus the condition number of A plotted as function of d shows a double descent behavior with a peak at d=n.
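
The observation is easy to reproduce; a few lines of NumPy (our illustration) show the condition number of a random (n, d) matrix peaking near d = n and improving in both the over- and under-determined regimes.

import numpy as np

rng = np.random.default_rng(0)
n = 50
for d in [10, 25, 45, 50, 55, 75, 100]:
    A = rng.standard_normal((n, d))            # random (n, d) matrix
    print(f"d = {d:3d}   condition number = {np.linalg.cond(A):.1f}")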

}, author = {Tomaso Poggio and Gil Kur and Andrzej Banburski} } @proceedings {4261, title = {Draping an Elephant: Uncovering Children{\textquoteright}s Reasoning About Cloth-Covered Objects}, year = {2019}, month = {07/2019}, address = {Montreal, Canada}, abstract = {

Humans have an intuitive understanding of physics. They can predict how a physical scene will unfold, and reason about how it came to be. Adults may rely on such a physical representation for visual reasoning and recognition, going beyond visual features and capturing objects in terms of their physical properties. Recently, draped objects were used in recognition tasks to examine adult object representations in the absence of many common visual features. In this paper we examine young children{\textquoteright}s reasoning about draped objects in order to examine the development of physical object representation. In addition, we argue that a better understanding of the development of the concept of cloth as a physical entity is worthwhile in and of itself, as it may form a basic ontological category in intuitive physical reasoning akin to liquids and solids. We use two experiments to investigate young children{\textquoteright}s (ages 3{\textendash}5) reasoning about cloth-covered objects, and find that they perform significantly above chance (though far from perfectly), indicating a representation of physical objects that can interact dynamically with the world. Children{\textquoteright}s success and failure pattern is similar across the two experiments, and we compare it to adult behavior. We find a small effect, which suggests the specific features that make reasoning about certain objects more difficult may carry into adulthood.

}, keywords = {analysis-by-synthesis, cloth, cognitive development, imagination, intuitive physics, object recognition, occlusion, perception, vision}, url = {https://mindmodeling.org/cogsci2019/papers/0506/index.html}, author = {Tomer D Ullman and Eliza Kosoy and Ilker Yildirim and Amir Arsalan Soltani and Max Siegel and Joshua B. Tenenbaum and Elizabeth S Spelke} } @conference {4516, title = {Dynamics \& Generalization in Deep Networks -Minimizing the Norm}, booktitle = {NAS Sackler Colloquium on Science of Deep Learning}, year = {2019}, month = {03/2019}, address = {Washington D.C.}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Jack Hidary and Tomaso Poggio} } @conference {4538, title = {Eccentricity Dependent Neural Network with Recurrent Attention for Scale, Translation and Clutter Invariance }, booktitle = {Vision Science Society}, year = {2019}, month = {05/2019}, address = {Florida, USA}, author = {Jiaxuan Zhang and Yena Han and Tomaso Poggio and Gemma Roig} } @article {4510, title = {Ecological origins of perceptual grouping principles in the auditory system}, journal = {Proceedings of the National Academy of Sciences}, volume = {116}, year = {2019}, month = {12/2019}, pages = {25355 - 25364}, abstract = {

Events and objects in the world must be inferred from sensory signals to support behavior. Because sensory measurements are temporally and spatially local, the estimation of an object or event can be viewed as the grouping of these measurements into representations of their common causes. Perceptual grouping is believed to reflect internalized regularities of the natural environment, yet grouping cues have traditionally been identified using informal observation and investigated using artificial stimuli. The relationship of grouping to natural signal statistics has thus remained unclear, and additional or alternative cues remain possible. Here, we develop a general methodology for relating grouping to natural sensory signals and apply it to derive auditory grouping cues from natural sounds. We first learned local spectrotemporal features from natural sounds and measured their co-occurrence statistics. We then learned a small set of stimulus properties that could predict the measured feature co-occurrences. The resulting cues included established grouping cues, such as harmonic frequency relationships and temporal coincidence, but also revealed previously unappreciated grouping principles. Human perceptual grouping was predicted by natural feature co-occurrence, with humans relying on the derived grouping cues in proportion to their informativity about co-occurrence in natural sounds. The results suggest that auditory grouping is adapted to natural stimulus statistics, show how these statistics can reveal previously unappreciated grouping phenomena, and provide a framework for studying grouping in natural signals.

}, issn = {0027-8424}, doi = {10.1073/pnas.1903887116}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1903887116}, author = {M{\l}ynarski, Wiktor and Josh H. McDermott} } @conference {4533, title = {Effects of Face Familiarity in Humans and Deep Neural Networks }, booktitle = {European Conference on Visual Perception}, year = {2019}, month = {09/2019}, address = {Leuven, Belgium}, author = {Dobs, Katharina and Ian A Palmer and Joanne Yuan and Yalda Mohsenzadeh and Aude Oliva and Nancy Kanwisher} } @article {4503, title = {Evidence for an attentional priority map in inferotemporal cortex}, journal = {Proceedings of the National Academy of Sciences}, volume = {116}, year = {2019}, month = {11/2019}, pages = {23797 - 23805}, abstract = {

From incoming sensory information, our brains make selections according to current behavioral goals. This process, selective attention, is controlled by parietal and frontal areas. Here, we show that another brain area, posterior inferotemporal cortex (PITd), also exhibits the defining properties of attentional control. We discovered this area with functional magnetic resonance imaging (fMRI) during an attentive motion discrimination task. Single-cell recordings from PITd revealed strong attentional modulation across 3 attention tasks yet no tuning to task-relevant stimulus features, like motion direction or color. Instead, PITd neurons closely tracked the subject{\textquoteright}s attention state and predicted upcoming errors of attentional selection. Furthermore, artificial electrical PITd stimulation controlled the location of attentional selection without altering feature discrimination. These are the defining properties of a feature-blind priority map encoding the locus of attention. Together, these results suggest area PITd, located strategically to gather information about object properties, as an attentional priority map.

}, issn = {0027-8424}, doi = {10.1073/pnas.1821866116}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1821866116}, author = {Stemmann, Heiko and W. A. Freiwald} } @article {4141, title = {Evidence that recurrent circuits are critical to the ventral stream{\textquoteright}s execution of core object recognition behavior}, journal = {Nature Neuroscience}, year = {2019}, month = {04/2019}, abstract = {

Non-recurrent deep convolutional neural networks (DCNNs) are currently the best models of core object recognition, a behavior supported by the densely recurrent primate ventral stream, culminating in the inferior temporal (IT) cortex. Are these recurrent circuits critical to the ventral stream{\textquoteright}s execution of this behavior? We reasoned that, if recurrence is critical, then primates should outperform feedforward-only DCNNs for some images, and that these images should require additional processing time beyond the feedforward IT response. Here we first used behavioral methods to discover hundreds of these {\textquotedblleft}challenge{\textquotedblright} images. Second, using large-scale IT electrophysiology in animals performing core recognition tasks, we observed that behaviorally-sufficient, linearly-decodable object identity solutions emerged ~30ms (on average) later in IT for challenge images compared to DCNN and primate performance-matched {\textquotedblleft}control{\textquotedblright} images. We observed these same late solutions even during passive viewing. Third, consistent with a failure of feedforward computations, the behaviorally-critical late-phase IT population response patterns evoked by the challenge images were poorly predicted by DCNN activations. Interestingly, very deep CNNs as well as not-so-deep but recurrent CNNs better predicted these late IT responses, suggesting a functional equivalence between additional nonlinear transformations and recurrence. Our results argue that automatically-evoked recurrent circuits are critical even for rapid object identification. By precisely comparing current DCNNs, primate behavior and IT population dynamics, we provide guidance for future recurrent model development.

}, doi = {10.1038/s41593-019-0392-5}, url = {https://www.nature.com/articles/s41593-019-0392-5}, author = {Kohitij Kar and Jonas Kubilius and Kailyn Schmidt and Elias B. Issa and James J. DiCarlo} } @conference {4531, title = {Evidence that recurrent pathways between the prefrontal and inferior temporal cortex is critical during core object recognition }, booktitle = {Society for Neuroscience}, year = {2019}, month = {10/2019}, address = {Chicago, IL, USA}, author = {Kohitij Kar and James J. DiCarlo} } @article {4146, title = {Evolving Images for Visual Neurons Using a Deep Generative Network Reveals Coding Principles and Neuronal Preferences}, journal = {Cell }, volume = {177}, year = {2019}, month = {05/2019}, pages = {1009}, chapter = {999}, abstract = {

What specific features should visual neurons encode, given the infinity of real-world images and the limited number of neurons available to represent them? We investigated neuronal selectivity in monkey inferotemporal cortex via the vast hypothesis space of a generative deep neural network, avoiding assumptions about features or semantic categories. A genetic algorithm searched this space for stimuli that maximized neuronal firing. This led to the evolution of rich synthetic images of objects with complex combinations of shapes, colors, and textures, sometimes resembling animals or familiar people, other times revealing novel patterns that did not map to any clear semantic category. These results expand our conception of the dictionary of features encoded in the cortex, and the approach can potentially reveal the internal representations of any system whose input can be captured by a generative model.

}, doi = {10.1016/j.cell.2019.04.005}, url = {https://www.cell.com/cell/fulltext/S0092-8674(19)30391-5}, author = {Carlos R Ponce and Will Xiao and Peter F Schade and Till S. Hartmann and Gabriel Kreiman and Margaret S Livingstone} } @article {4555, title = {Eye movements and retinotopic tuning in developmental prosopagnosia}, journal = {Journal of Vision}, volume = {19}, year = {2019}, month = {Jan-08-2019}, pages = {7}, issn = {1534-7362}, doi = {10.1167/19.9.7}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31426085}, author = {M.F. Peterson and Ian Zaun and Hoke, Harris and Jiahui, Guo and Duchaine, Brad and Nancy Kanwisher} } @conference {4543, title = {Facial Expression Scoring and Assessment of Facial Movement Kinematics in Non-Human Primates}, booktitle = {The Rockefeller University 2019 Summer Science Research Program (SSRP)}, year = {2019}, month = {08/2019}, address = {New York, NY, USA}, author = {Obiajulu, D. and Yuriria Vazquez and G.A. Ianni and Yazdani, F. and W. A. Freiwald} } @inbook {4198, title = {Fast and Accurate Seismic Tomography via Deep Learning}, booktitle = {Deep Learning: Algorithms and Applications}, year = {2019}, publisher = {SPRINGER-VERLAG}, organization = {SPRINGER-VERLAG}, author = {Mauricio Araya-Polo and Amir Adler and Stuart Farris and Joseph Jennings} } @proceedings {4387, title = {Finding Friend and Foe in Multi-Agent Games}, year = {2019}, month = {05/2019}, address = {Vancouver, Canada}, abstract = {

AI for multi-agent games like Go, Poker, and Dota has seen great strides in recent years. Yet none of these games address the real-life challenge of cooperation in the presence of unknown and uncertain teammates. This challenge is a key game mechanism in hidden role games. Here we develop the DeepRole algorithm, a multi-agent reinforcement learning agent that we test on The Resistance: Avalon, the most popular hidden role game. DeepRole combines counterfactual regret minimization (CFR) with deep value networks trained through self-play. Our algorithm integrates deductive reasoning into vector-form CFR to reason about joint beliefs and deduce partially observable actions. We augment deep value networks with constraints that yield interpretable representations of win probabilities. These innovations enable DeepRole to scale to the full Avalon game. Empirical game-theoretic methods show that DeepRole outperforms other hand-crafted and learned agents in five-player Avalon. DeepRole played with and against human players on the web in hybrid human-agent teams. We find that DeepRole outperforms human players as both a cooperator and a competitor.

}, author = {Jack Serrino and Max Kleiman-Weiner and David C. Parkes and Joshua B. Tenenbaum} } @article {4505, title = {The Generative Adversarial Brain}, journal = {Frontiers in Artificial Intelligence}, volume = {2}, year = {2019}, month = {09/2019}, abstract = {

The idea that the brain learns generative models of the world has been widely promulgated. Most approaches have assumed that the brain learns an explicit density model that assigns a probability to each possible state of the world. However, explicit density models are difficult to learn, requiring approximate inference techniques that may find poor solutions. An alternative approach is to learn an implicit density model that can sample from the generative model without evaluating the probabilities of those samples. The implicit model can be trained to fool a discriminator into believing that the samples are real. This is the idea behind generative adversarial algorithms, which have proven adept at learning realistic generative models. This paper develops an adversarial framework for probabilistic computation in the brain. It first considers how generative adversarial algorithms overcome some of the problems that vex prior theories based on explicit density models. It then discusses the psychological and neural evidence for this framework, as well as how the breakdown of the generator and discriminator could lead to delusions observed in some mental disorders.
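
For reference, the generative adversarial objective the framework builds on is the standard minimax game between a generator G and a discriminator D (standard form, not copied from the paper):

\[
  \min_{G} \max_{D} \;
  \mathbb{E}_{x \sim p_{\mathrm{data}}}\bigl[\log D(x)\bigr]
  \;+\;
  \mathbb{E}_{z \sim p_{z}}\bigl[\log\bigl(1 - D(G(z))\bigr)\bigr].
\]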

}, doi = {10.3389/frai.2019.00018}, url = {https://www.frontiersin.org/article/10.3389/frai.2019.00018/full}, author = {Samuel J Gershman} } @conference {4449, title = {Hard choices: Children{\textquoteright}s understanding of the cost of action selection. }, booktitle = {Cognitive Science Society}, year = {2019}, author = {Shari Liu and Fiery A Cushman and Samuel J Gershman and Kool, Wouter and Elizabeth S Spelke} } @article {4298, title = {Hippocampal Remapping as Hidden State Inference}, year = {2019}, month = {08/2019}, abstract = {

Cells in the hippocampus tuned to spatial location (place cells) typically change their tuning when an animal changes context, a phenomenon known as remapping. A fundamental challenge to understanding remapping is the fact that what counts as a {\textquotedblleft}context change{\textquotedblright} has never been precisely defined. Furthermore, different remapping phenomena have been classified on the basis of how much the tuning changes after different types and degrees of context change, but the relationship between these variables is not clear. We address these ambiguities by formalizing remapping in terms of hidden state inference. According to this view, remapping does not directly reflect objective, observable properties of the environment, but rather subjective beliefs about the hidden state of the environment. We show how the hidden state framework can resolve a number of puzzles about the nature of remapping.

}, doi = {https://doi.org/10.1101/743260}, author = {Honi Sanders and Matthew A. Wilson and Samuel J Gershman} } @article {4305, title = {How Adults{\textquoteright} Actions, Outcomes, and Testimony Affect Preschoolers{\textquoteright} Persistence}, journal = {Child Development}, year = {2019}, month = {Sep-09-2019}, abstract = {

Across four experiments, we looked at how 4- and 5-year-olds{\textquoteright} (n\ =\ 520) task persistence was affected by observations of adult actions (high or low effort), outcomes (success or failure), and testimony (setting expectations{\textemdash}{\textquotedblleft}This will be hard,{\textquotedblright} pep talks{\textemdash}{\textquotedblleft}You can do this,{\textquotedblright} value statements{\textemdash}{\textquotedblleft}Trying hard is important,{\textquotedblright} and baseline). Across experiments, outcomes had the biggest impact: preschoolers consistently tried harder after seeing the adult succeed than fail. Additionally, adult effort affected children{\textquoteright}s persistence, but only when the adult succeeded. Finally, children{\textquoteright}s persistence was highest when the adult both succeeded and practiced what she preached: exerting effort while testifying to its value.

}, issn = {0009-3920}, doi = {10.1111/cdev.13305}, url = {https://srcd.onlinelibrary.wiley.com/doi/10.1111/cdev.13305}, author = {Leonard, Julia A. and Garcia, Andrea and Laura Schulz} } @article {4104, title = {How Does the Brain Represent Language and Answer Questions? Using an AI System to Understand the Underlying Neurobiological Mechanisms}, journal = {Frontiers in Computational Neuroscience}, volume = {13}, year = {2019}, month = {03/2019}, abstract = {

To understand the computations that underlie high-level cognitive processes we propose a framework of mechanisms that could in principle implement START, an AI program that answers questions using natural language. START organizes a sentence into a series of triplets, each containing three elements (subject, verb, object). We propose that the brain similarly defines triplets and then chunks the three elements into a spatial pattern. A complete sentence can be represented using up to 7 triplets in a working memory buffer organized by theta and gamma oscillations. This buffer can transfer information into long-term memory networks where a second chunking operation converts the serial triplets into a single spatial pattern in a network, with each triplet (with corresponding elements) represented in specialized subregions. The triplets that define a sentence become synaptically linked, thereby encoding the sentence in synaptic weights. When a question is posed, there is a search for the closest stored memory (having the greatest number of shared triplets). We have devised a search process that does not require that the question and the stored memory have the same number of triplets or have triplets in the same order. Once the most similar memory is recalled and undergoes 2-level dechunking, the sought for information can be obtained by element-by-element comparison of the key triplet in the question to the corresponding triplet in the retrieved memory. This search may require a reordering to align corresponding triplets, the use of pointers that link different triplets, or the use of semantic memory. Our framework uses 12 network processes; existing models can implement many of these, but in other cases we can only suggest neural implementations. Overall, our scheme provides the first view of how language-based question answering could be implemented by the brain.
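
The triplet bookkeeping described above can be illustrated with a small Python sketch of the matching step only (our toy example; triplet extraction itself is done by START): stored sentences are sets of (subject, verb, object) triplets, and a question is matched to the memory whose triplets agree with it element by element the most.

memories = {
    "sentence 1": {("dog", "chased", "cat"), ("cat", "climbed", "tree")},
    "sentence 2": {("dog", "ate", "bone")},
}

def triplet_overlap(t1, t2):
    """Element-by-element comparison of two (subject, verb, object) triplets."""
    return sum(a == b for a, b in zip(t1, t2))

def closest_memory(question):
    """Return the stored memory whose triplets best match the question's triplets."""
    def score(mem):
        return sum(max(triplet_overlap(q, t) for t in memories[mem]) for q in question)
    return max(memories, key=score)

question = {("dog", "chased", "?")}        # "What did the dog chase?"
print(closest_memory(question))            # -> "sentence 1"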

}, doi = {10.3389/fncom.2019.00012}, url = {https://www.frontiersin.org/articles/10.3389/fncom.2019.00012/full}, author = {Idiart, Marco A. P. and Villavicencio, Aline and Katz, Boris and Renn{\'o}-Costa, C{\'e}sar and Lisman, John} } @article {4180, title = {How face perception unfolds over time}, journal = {Nature Communications}, volume = {10}, year = {2019}, month = {01/2019}, abstract = {

Within a fraction of a second of viewing a face, we have already determined its gender, age and identity. A full understanding of this remarkable feat will require a characterization of the computational steps it entails, along with the representations extracted at each. Here, we used magnetoencephalography (MEG) to measure the time course of neural responses to faces, thereby addressing two fundamental questions about how face processing unfolds over time. First, using representational similarity analysis, we found that facial gender and age information emerged before identity information, suggesting a coarse-to-fine processing of face dimensions. Second, identity and gender representations of familiar faces were enhanced very early on, suggesting that the behavioral benefit for familiar faces results from tuning of early feed-forward processing mechanisms. These findings start to reveal the time course of face processing in humans, and provide powerful new constraints on computational theories of face perception.

}, doi = {10.1038/s41467-019-09239-1}, url = {http://www.nature.com/articles/s41467-019-09239-1}, author = {Dobs, Katharina and Leyla Isik and Pantazis, Dimitrios and Nancy Kanwisher} } @article {4191, title = {How to never be wrong}, journal = {Psychonomic Bulletin \& Review}, volume = {26}, year = {2019}, month = {02/2019}, pages = {13 - 28}, abstract = {

Human beliefs have remarkable robustness in the face of disconfirmation. This robustness is often explained as the product of heuristics or motivated reasoning. However, robustness can also arise from purely rational principles when the reasoner has recourse to ad hoc auxiliary hypotheses. Auxiliary hypotheses primarily function as the linking assumptions connecting different beliefs to one another and to observational data, but they can also function as a "protective belt" that explains away disconfirmation by absorbing some of the blame. The present article traces the role of auxiliary hypotheses from philosophy of science to Bayesian models of cognition and a host of behavioral phenomena, demonstrating their wide-ranging implications.

}, issn = {1069-9384}, doi = {10.3758/s13423-018-1488-8}, url = {http://link.springer.com/10.3758/s13423-018-1488-8}, author = {Samuel J Gershman} } @conference {4521, title = {Identification of vigilance states in freely behaving animals using thalamocortical activity and Deep Belief networks}, booktitle = {Society for Neuroscience}, year = {2019}, month = {10/2019}, author = {Jordan Harrod and Patrick L. Purdon and Emery N. Brown and Francisco J. Flores} } @article {4509, title = {Illusory sound texture reveals multi-second statistical completion in auditory scene analysis}, journal = {Nature Communications}, volume = {10}, year = {2019}, month = {11/2019}, abstract = {

Sound sources in the world are experienced as stable even when intermittently obscured, implying perceptual completion mechanisms that "fill in"\ missing sensory information. We demonstrate a filling-in phenomenon in which the brain extrapolates the statistics of background sounds (textures) over periods of several seconds when they are interrupted by another sound, producing vivid percepts of illusory texture. The effect differs from previously described completion effects in that 1) the extrapolated sound must be defined statistically given the stochastic nature of texture, and 2) the effect lasts much longer, enabling introspection and facilitating assessment of the underlying representation. Illusory texture biases subsequent texture statistic estimates indistinguishably from actual texture, suggesting that it is represented similarly to actual texture. The illusion appears to represent an inference about whether the background is likely to continue during concurrent sounds, providing a stable statistical representation of the ongoing environment despite unstable sensory evidence.

}, doi = {10.1038/s41467-019-12893-0}, url = {http://www.nature.com/articles/s41467-019-12893-0}, author = {McWalter, Richard and Josh H. McDermott} } @proceedings {4386, title = {Implicit Regularization of Accelerated Methods in Hilbert Spaces}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We study learning properties of accelerated gradient descent methods for linear least-squares in Hilbert spaces. We analyze the implicit regularization properties of Nesterov acceleration and a variant of heavy-ball in terms of corresponding learning error bounds. Our results show that acceleration can provide faster bias decay than gradient descent, but it also suffers from more unstable behavior. As a result, acceleration cannot in general be expected to improve learning accuracy with respect to gradient descent, but rather to achieve the same accuracy with reduced computations. Our theoretical results are validated by numerical simulations. Our analysis is based on studying suitable polynomials induced by the accelerated dynamics and combining spectral techniques with concentration inequalities.
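A minimal numerical sketch of the setting, assuming a finite-dimensional least-squares problem and textbook parameter choices (not those analyzed in the paper), compares plain gradient descent with Nesterov-accelerated iterations:

    import numpy as np

    rng = np.random.default_rng(0)
    n, d = 200, 50
    X = rng.standard_normal((n, d))
    w_true = rng.standard_normal(d)
    y = X @ w_true + 0.5 * rng.standard_normal(n)

    L = np.linalg.norm(X, 2) ** 2 / n      # Lipschitz constant of the gradient
    step = 1.0 / L

    def grad(w):
        return X.T @ (X @ w - y) / n

    # Plain gradient descent.
    w = np.zeros(d)
    for _ in range(100):
        w = w - step * grad(w)

    # Nesterov acceleration: the same gradient, applied at an extrapolated point.
    w_acc, w_prev = np.zeros(d), np.zeros(d)
    for k in range(100):
        beta = k / (k + 3)                  # standard momentum schedule
        v = w_acc + beta * (w_acc - w_prev)
        w_prev, w_acc = w_acc, v - step * grad(v)

    print("GD training loss:      ", np.mean((X @ w - y) ** 2))
    print("Nesterov training loss:", np.mean((X @ w_acc - y) ** 2))
    # Acceleration typically reaches a given training loss in fewer iterations,
    # but, as the paper argues, the stopping time matters for generalization.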

}, author = {Nicol{\`o} Pagliana and Lorenzo Rosasco} } @conference {4520, title = {In silico modeling of temporally interfering electric fields for deep brain stimulation }, booktitle = {Society for Neuroscience}, year = {2019}, month = {10/2019}, address = {Chicago, IL, USA}, author = {Isabella Dalla Betta and Antonino Cassara and Edward S Boyden and Emery N. Brown and francisco and Francisco J. Flores} } @article {4199, title = {Incentives Boost Model-Based Control Across a Range of Severity on Several Psychiatric Constructs}, journal = {Biological Psychiatry}, volume = {85}, year = {2019}, month = {03/2019}, pages = {425 - 433}, abstract = {

Background

Human decision making exhibits a mixture of model-based and model-free control. Recent evidence indicates that arbitration between these two modes of control ({\textquotedblleft}metacontrol{\textquotedblright}) is based on their relative costs and benefits. While model-based control may increase accuracy, it requires greater computational resources, so people invoke model-based control only when potential rewards exceed those of model-free control. We used a sequential decision task, while concurrently manipulating performance incentives, to ask if symptoms and traits of psychopathology decrease or increase model-based control in response to incentives.

Methods

We recruited a nonpatient population of 839 online participants using Amazon Mechanical Turk who completed transdiagnostic self-report measures encompassing symptoms, traits, and factors. We fit a dual-controller reinforcement learning model and obtained a computational measure of model-based control separately for small incentives and large incentives.

Results

None of the constructs were related to a failure of large incentives to boost model-based control. In fact, for the sensation seeking trait and anxious-depression factor, higher scores were associated with a larger incentive effect, whereby greater levels of these constructs were associated with larger increases in model-based control. Many constructs showed decreases in model-based control as a function of severity, but a social withdrawal factor was positively correlated; alcohol use and social anxiety were unrelated to model-based control.

Conclusions

Our results demonstrate that model-based control can reliably be improved independent of construct severity for most measures. This suggests that incentives may be a useful intervention for boosting model-based control across a range of symptom and trait severity.

}, keywords = {Computational psychiatry, Habits and goals Incentives, Model-based control, Psychiatric constructs, reinforcement learning}, issn = {00063223}, doi = {10.1016/j.biopsych.2018.06.018}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0006322318316329}, author = {Patzelt, Edward H. and Kool, Wouter and Millner, Alexander J. and Samuel J Gershman} } @article {4178, title = {An integrative computational architecture for object-driven cortex}, journal = {Current Opinion in Neurobiology}, volume = {55}, year = {2019}, month = {01/2019}, pages = {73 - 81}, abstract = {

Objects in motion activate multiple cortical regions in every lobe of the human brain. Do these regions represent a collection of independent systems, or is there an overarching functional architecture spanning all of object-driven cortex? Inspired by recent work in artificial intelligence (AI), machine learning, and cognitive science, we consider the hypothesis that these regions can be understood as a coherent network implementing an integrative computational system that unifies the functions needed to perceive, predict, reason about, and plan with physical objects{\textemdash}as in the paradigmatic case of using or making tools. Our proposal draws on a modeling framework that combines multiple AI methods, including causal generative models, hybrid symbolic-continuous planning algorithms, and neural recognition networks, with object-centric, physics-based representations. We review evidence relating specific components of our proposal to the specific regions that comprise object-driven cortex, and lay out future research directions with the goal of building a complete functional and mechanistic account of this system.

}, issn = {09594388}, doi = {10.1016/j.conb.2019.01.010}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0959438818301995}, author = {Ilker Yildirim and Jiajun Wu and Nancy Kanwisher and Joshua B. Tenenbaum} } @article {4507, title = {Invariance to background noise as a signature of non-primary auditory cortex}, journal = {Nature Communications}, volume = {10}, year = {2019}, month = {09/2019}, abstract = {

Despite well-established anatomical differences between primary and non-primary auditory cortex, the associated representational transformations have remained elusive. Here we show that primary and non-primary auditory cortex are differentiated by their invariance to real-world background noise. We measured fMRI responses to natural sounds presented in isolation and in real-world noise, quantifying invariance as the correlation between the two responses for individual voxels. Non-primary areas were substantially more noise-invariant than primary areas. This primary-nonprimary difference occurred both for speech and non-speech sounds and was unaffected by a concurrent demanding visual task, suggesting that the observed invariance is not specific to speech processing and is robust to inattention. The difference was most pronounced for real-world background noise {\textendash} both primary and non-primary areas were relatively robust to simple types of synthetic noise. Our results suggest a general representational transformation between auditory cortical stages, illustrating a representational consequence of hierarchical organization in the auditory system.
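The voxel-wise invariance measure described here can be sketched in a few lines; the arrays below are synthetic placeholders for measured responses, not real data:

    import numpy as np

    rng = np.random.default_rng(0)
    n_sounds, n_voxels = 30, 5000
    resp_clean = rng.standard_normal((n_sounds, n_voxels))
    # Noisy-condition responses: partly shared with the clean responses.
    resp_noise = 0.6 * resp_clean + 0.8 * rng.standard_normal((n_sounds, n_voxels))

    def columnwise_corr(a, b):
        """Pearson correlation between corresponding columns of a and b."""
        a = (a - a.mean(0)) / a.std(0)
        b = (b - b.mean(0)) / b.std(0)
        return (a * b).mean(0)

    invariance = columnwise_corr(resp_clean, resp_noise)   # one value per voxel
    # Comparing the distribution of these values in primary vs. non-primary ROIs
    # is the kind of contrast reported in the paper.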

}, doi = {10.1038/s41467-019-11710-y}, url = {http://www.nature.com/articles/s41467-019-11710-y}, author = {Alexander J. E. Kell and Josh H. McDermott} } @article {4554, title = {Invariant representations of mass in the human brain}, journal = {eLife}, volume = {8}, year = {2019}, month = {May-12-2020}, doi = {10.7554/eLife.46619}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31845887}, author = {Schwettmann, Sarah and Joshua B. Tenenbaum and Nancy Kanwisher} } @article {4791, title = {It{\textquoteright}s a small dimensional world after all}, journal = {Physics of Life Reviews}, volume = {29}, year = {2019}, month = {07/2019}, pages = {96 - 97}, issn = {15710645}, doi = {10.1016/j.plrev.2019.03.015}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1571064519300612}, author = {Gabriel Kreiman} } @article {4508, title = {Judgments of effort for magical violations of intuitive physics}, journal = {PLOS ONE}, volume = {14}, year = {2019}, month = {05/2020}, pages = {e0217513}, abstract = {

People spend much of their time in imaginary worlds, and have beliefs about the events that are likely in those worlds, and the laws that govern them. Such beliefs are likely affected by people{\textquoteright}s intuitive theories of the real world. In three studies, people judged the effort required to cast spells that cause physical violations. People ranked the actions of spells congruently with intuitive physics. For example, people judge that it requires more effort to conjure up a frog than to levitate it one foot off the ground. A second study manipulated the target and extent of the spells, and demonstrated with a continuous measure that people are sensitive to this manipulation even between participants. A pre-registered third study replicated the results of Study 2. These results suggest that people{\textquoteright}s intuitive theories partly account for how they think about imaginary worlds.

}, doi = {10.1371/journal.pone.021751310.1371}, url = {http://dx.plos.org/10.1371/journal.pone.0217513}, author = {John P. McCoy and Ullman, Tomer}, editor = {Capraro, Valerio} } @article {4194, title = {Language, gesture, and judgment: Children{\textquoteright}s paths to abstract geometry}, journal = {Journal of Experimental Child Psychology}, volume = {177}, year = {2019}, month = {01/2019}, pages = {70 - 85}, abstract = {

As infants, children are sensitive to geometry when recognizing objects or navigating through rooms; however, explicit knowledge of geometry develops slowly and may be unstable even in adults. How can geometric concepts be both so accessible and so elusive? To examine how implicit and explicit geometric concepts develop, the current study assessed three distinct channels in 132 children (3-8 years old) as they played a simple geometric judgment task: the children{\textquoteright}s choices during the game, as well as the language and gestures they used to justify and accompany those choices. Results showed that, for certain geometric properties, children chose the correct card even if they could not express with words (or gestures) why they had made this choice. Furthermore, other geometric concepts were expressed and supported by gestures prior to their articulation in either choices or speech. These findings reveal that gestures and behavioral choices may reflect implicit knowledge and serve as a foundation for the development of geometric reasoning. Altogether, our results suggest that language alone might not be enough for expressing and organizing geometric concepts and that children pursue multiple paths to overcome its limitations, a finding with potential implications for primary education in mathematics.

}, keywords = {Explicit knowledge; Geometrical reasoning; Gestures; Implicit knowledge; Language; Thought}, issn = {00220965}, doi = {10.1016/j.jecp.2018.07.015}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0022096517306252}, author = {Calero, Cecilia I. and Shalom, Diego E. and Elizabeth S Spelke and Sigman, Mariano} } @conference {4321, title = {Large-scale hyperparameter search for predicting human brain responses in the Algonauts challenge}, booktitle = { The Algonauts Project: Explaining the Human Visual Brain Workshop 2019 }, year = {2019}, month = {8/14/2019}, address = {MIT, Cambridge MA}, doi = {10.1101/689844 }, url = {https://www.biorxiv.org/content/10.1101/689844v2.full}, author = {K.M. Jozwik and Lee, M. and Marques, T. and Martin Schrimpf and Pouya Bashivan} } @conference {4518, title = {Learning Language from Vision.}, booktitle = {Workshop on Visually Grounded Interaction and Language (ViGIL) at the Thirty-third Annual Conference on Neural Information Processing Systems (NeurIPS)}, year = {2019}, month = {12/2019}, address = {Vancouver Convention Center, Vancouver, Canada}, author = {Candace Ross and Yevgeni Berzak and Boris Katz and Andrei Barbu} } @conference {4530, title = {A meta-analysis of ANNs as models of primate V1 }, booktitle = {Bernstein}, year = {2019}, month = {09/2019}, address = {Berlin, Germany}, author = {Tiago Marques and James J. DiCarlo} } @proceedings {4373, title = {Metamers of neural networks reveal divergence from human perceptual systems}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Deep neural networks have been embraced as models of sensory systems, instantiating representational transformations that appear to resemble those in the visual and auditory systems. To more thoroughly investigate their similarity to biological systems, we synthesized model metamers {\textendash} stimuli that produce the same responses at some stage of a network{\textquoteright}s representation. We generated model metamers for natural stimuli by performing gradient descent on a noise signal, matching the responses of individual layers of image and audio networks to a natural image or speech signal. The resulting signals reflect the invariances instantiated in the network up to the matched layer. We then measured whether model metamers were recognizable to human observers {\textendash} a necessary condition for the model representations to replicate those of humans. Although model metamers from early network layers were recognizable to humans, those from deeper layers were not. Auditory model metamers became more human-recognizable with architectural modifications that reduced aliasing from pooling operations, but those from the deepest layers remained unrecognizable. We also used the metamer test to compare model representations. Cross-model metamer recognition dropped off for deeper layers, roughly at the same point that human recognition deteriorated, indicating divergence across model representations. The results reveal discrepancies between model and human representations, but also show how metamers can help guide model refinement and elucidate model representations.
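The metamer-generation idea can be illustrated with a toy, one-layer stand-in for a network stage (random weights, analytic gradients); this is only a sketch of the general recipe, not the image/audio pipeline used in the paper:

    import numpy as np

    rng = np.random.default_rng(0)
    dim, n_units = 256, 512
    W = rng.standard_normal((n_units, dim)) / np.sqrt(dim)    # toy model "layer"

    def layer(x):
        return np.maximum(W @ x, 0.0)                          # ReLU responses

    natural = rng.standard_normal(dim)      # stand-in for a natural image/sound
    target = layer(natural)                 # responses to be matched

    x = rng.standard_normal(dim)            # start from noise
    print("initial mismatch:", np.linalg.norm(layer(x) - target))

    lr = 0.05
    for _ in range(2000):
        r = layer(x)
        # Gradient of 0.5 * ||layer(x) - target||^2 with respect to x.
        g = W.T @ ((r - target) * (W @ x > 0))
        x -= lr * g

    print("final mismatch:  ", np.linalg.norm(layer(x) - target))
    # x is now a "metamer" for this toy stage: (nearly) matched responses, even
    # though x need not resemble the natural signal -- the property probed with
    # human observers in the paper.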

}, url = {https://papers.nips.cc/paper/9198-metamers-of-neural-networks-reveal-divergence-from-human-perceptual-systems}, author = {Jenelle Feather and Alex Durango and Ray Gonzalez and Josh H. McDermott} } @conference {4105, title = {Minimal images in deep neural networks: Fragile Object Recognition in Natural Images}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2019}, address = {New Orleans, La}, abstract = {

The human ability to recognize objects is impaired when the object is not shown in full. "Minimal images" are the smallest regions of an image that remain recognizable for humans. Ullman et al. 2016 show that a slight modification of the location and size of the visible region of the minimal image produces a sharp drop in human recognition accuracy. In this paper, we demonstrate that such drops in accuracy due to changes of the visible region are a phenomenon common to humans and existing state-of-the-art deep neural networks (DNNs), and are much more prominent in DNNs. We found many cases where DNNs classified one region correctly and the other incorrectly, even though the two regions differed by only one row or column of pixels and were often larger than the average human minimal image size. We show that this phenomenon is distinct from the previously reported lack of invariance of DNNs to minor changes in object location. Our results thus reveal a new failure mode of DNNs that also affects humans, though to a much lesser degree. They expose how fragile DNN recognition ability is for natural images, even without adversarial patterns being introduced. Bringing the robustness of DNNs in natural images to the human level remains an open challenge for the community.

}, url = {https://arxiv.org/pdf/1902.03227.pdf}, author = {S. Srivastava and Guy Ben-Yosef and X. Boix} } @article {4196, title = {A model for discovering {\textquoteleft}containment{\textquoteright} relations}, journal = {Cognition}, volume = {183}, year = {2019}, month = {02/2019}, pages = {67 - 81}, abstract = {

Rapid developments in the fields of learning and object recognition have been obtained by successfully developing and using methods for learning from a large number of labeled image examples. However, such current methods cannot explain infants{\textquoteright} learning of new concepts based on their visual experience, in particular, the ability to learn complex concepts without external guidance, as well as the natural order in which related concepts are acquired. A remarkable example of early visual learning is the category of {\textquoteright}containers{\textquoteright} and the notion of {\textquoteright}containment{\textquoteright}. Surprisingly, this is one of the earliest spatial relations to be learned, starting already around 3 months of age, and preceding other common relations (e.g., {\textquoteright}support{\textquoteright}, {\textquoteright}in-between{\textquoteright}). In this work we present a model that explains infants{\textquoteright} capacity for learning {\textquoteright}containment{\textquoteright} and related concepts by {\textquoteright}just looking{\textquoteright}, together with their empirical developmental trajectory. Learning in the model occurs quickly and without external guidance, relying only on perceptual processes that are present in the first months of life. Instead of labeled training examples, the system provides its own internal supervision to guide the learning process. We show how the detection of so-called {\textquoteright}paradoxical occlusion{\textquoteright} provides natural internal supervision, which guides the system to gradually acquire a range of useful containment-related concepts. Similar mechanisms of using implicit internal supervision can have broad application in other cognitive domains as well as artificial intelligence systems, because they alleviate the need for supplying extensive external supervision, and because they can guide the learning process to extract concepts that are meaningful to the observer, even if they are not by themselves obvious or salient in the input.

}, keywords = {Computational model; Containment relation; Developmental trajectory; Infants{\textquoteright} perceptual learning; Spatial relations learning; Unsupervised learning}, issn = {00100277}, doi = {10.1016/j.cognition.2018.11.001}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010027718302877}, author = {Shimon Ullman and Dorfman, Nimrod and Harari, Daniel} } @proceedings {4380, title = {Modeling Expectation Violation in Intuitive Physics with Coarse Probabilistic Object Representations}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

From infancy, humans have expectations about how objects will move and interact. Even young children expect objects not to move through one another, teleport, or disappear. They are surprised by mismatches between physical expectations and perceptual observations, even in unfamiliar scenes with completely novel objects. A model that exhibits human-like understanding of physics should be similarly surprised, and adjust its beliefs accordingly. We propose ADEPT, a model that uses a coarse (approximate geometry) object-centric representation for dynamic 3D scene understanding. Inference integrates deep recognition networks, extended probabilistic physical simulation, and particle filtering for forming predictions and expectations across occlusion. We also present a new test set for measuring violations of physical expectations, using a range of scenarios derived from developmental psychology. We systematically compare ADEPT, baseline models, and human expectations on this test set. ADEPT outperforms standard network architectures in discriminating physically implausible scenes, and often performs this discrimination at the same level as people.

}, url = {http: //physadept.csail.mit.edu/}, author = {Kevin A Smith and Lingjie Mei and Shunyu Yao and Jiajun Wu and Elizabeth S Spelke and Joshua B. Tenenbaum and Tomer D. Ullman} } @conference {4535, title = {Neural mechanisms supporting facial expressions }, booktitle = {unknown}, year = {2019}, month = {04/2019}, address = {New York, NY, USA}, author = {Yuriria Vazquez and Geena Ianni and W. A. Freiwald} } @article {4143, title = {Neural Population Control via Deep Image Synthesis}, journal = {Science}, volume = {364}, year = {2019}, month = {05/2019}, abstract = {Particular deep artificial neural networks (ANNs) are today{\textquoteright}s most accurate models of the primate brain{\textquoteright}s ventral visual stream. Here we report that, using an ANN-driven image synthesis method, new luminous power patterns (i.e. images) can be applied to the primate retinae to predictably push the spiking activity of targeted V4 neural sites beyond naturally occurring levels. More importantly, this method, while not yet perfect, achieves unprecedented independent control of the activity state of entire populations of V4 neural sites, even those with overlapping receptive fields. These results show how the knowledge embedded in today{\textquoteright}s ANN models might be used to noninvasively set desired internal brain states at neuron-level resolution, and suggest that more accurate ANN models would produce even more accurate control. }, doi = {10.1126/science.aav9436 }, url = {https://science.sciencemag.org/content/364/6439/eaav9436}, author = {Pouya Bashivan and Kohitij Kar and James J. DiCarlo} } @proceedings {4388, title = {ObjectNet: A large-scale bias-controlled dataset for pushing the limits of object recognition models}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We collect a large real-world test set, ObjectNet, for object recognition with controls where object backgrounds, rotations, and imaging viewpoints are random. Most scientific experiments have controls, confounds which are removed from the data, to ensure that subjects cannot perform a task by exploiting trivial correlations in the data. Historically, large machine learning and computer vision datasets have lacked such controls. This has resulted in models that must be fine-tuned for new datasets and perform better on datasets than in real-world applications. When tested on ObjectNet, object detectors show a 40-45\% drop in performance, with respect to their performance on other benchmarks, due to the controls for biases. Controls make ObjectNet robust to fine-tuning showing only small performance increases. We develop a highly automated platform that enables gathering datasets with controls by crowdsourcing image capturing and annotation. ObjectNet is the same size as the ImageNet test set (50,000 images), and by design does not come paired with a training set in order to encourage generalization. The dataset is both easier than ImageNet (objects are largely centered and unoccluded) and harder (due to the controls). Although we focus on object recognition here, data with controls can be gathered at scale using automated tools throughout machine learning to generate datasets that exercise models in new ways thus providing valuable feedback to researchers. This work opens up new avenues for research in generalizable, robust, and more human-like computer vision and in creating datasets where results are predictive of real-world performance.

}, author = {Andrei Barbu and David Mayo and Julian Alverio and William Luo and Christopher Wang and Dan Gutfreund and Joshua B. Tenenbaum and Boris Katz} } @article {4551, title = {Open Source Subject Database Project (OSSDP)}, year = {2019}, month = {11/2019}, publisher = {Center for Brains, Minds and Machines}, abstract = {

Working with some of our human studies laboratories, we have developed a website-wrapped database system for self-signup, contact tracking, visit tracking and other related data for experiments with human subjects.

The Open Source Subject Database Project, provided by the Center for Brains, Minds and Machines, is a free*, open source alternative to systems that require annual fees, expensive developers, and proprietary software such as FileMaker Pro. It is built on the free and open source Drupal 8 platform (PHP, MySQL, Apache), which is widely used in higher education, government, and industry, and is fully customizable.

We are currently on v2.0.1, updating to synced Family/Child records with field-standardized age calculations and more.

Read more, view screenshots, and download the software on the OSSDP mini site.

}, author = {Kris Brewer and Ben Mittman and Jonathan Kominsky and John Henes} } @article {4282, title = {Origins of the concepts cause, cost, and goal in prereaching infants}, journal = {PNAS}, year = {2019}, month = {08/2019}, abstract = {

We investigated the origins and interrelations of causal knowledge and knowledge of agency in 3-month-old infants, who cannot yet effect changes in the world by reaching for, grasping, and picking up objects. Across 5 experiments, n = 152 prereaching infants viewed object-directed reaches that varied in efficiency (following the shortest physically possible path vs. a longer path), goal (lifting an object vs. causing a change in its state), and causal structure (action on contact vs. action at a distance and after a delay). Prereaching infants showed no strong looking preference between a person{\textquoteright}s efficient and inefficient reaches when the person grasped and displaced an object. When the person reached for and caused a change in the state of the object on contact, however, infants looked longer when this action was inefficient than when it was efficient. Three-month-old infants also showed a key signature of adults{\textquoteright} and older infants{\textquoteright} causal inferences: This looking preference was abolished if a short spatial and temporal gap separated the action from its effect. The basic intuition that people are causal agents, who navigate around physical constraints to change the state of the world, may be one important foundation for infants{\textquoteright} ability to plan their own actions and learn from the acts of others.

}, issn = {1091-6490}, doi = {https://doi.org/10.1073/pnas.1904410116}, url = {https://www.pnas.org/content/early/2019/08/19/1904410116/tab-article-info}, author = {Shari Liu and Neon B. Brooks and Elizabeth S Spelke} } @article {4452, title = {Origins of the concepts cause, cost, and goal in prereaching infants.}, year = {2019}, author = {Shari Liu and Neon B. Brooks and Elizabeth S Spelke} } @article {4502, title = {Parts-based representations of perceived face movements in the superior temporal sulcus}, journal = {Human Brain Mapping}, volume = {40}, year = {2019}, month = {02/2019}, pages = {2499 - 2510}, abstract = {

Facial motion is a primary source of social information about other humans. Prior fMRI studies have identified regions of the superior temporal sulcus (STS) that respond specifically to perceived face movements (termed fSTS), but little is known about the nature of motion representations in these regions. Here we use fMRI and multivoxel pattern analysis to characterize the representational content of the fSTS. Participants viewed a set of specific eye and mouth movements, as well as combined eye and mouth movements. Our results demonstrate that fSTS response patterns contain information about face movements, including subtle distinctions between types of eye and mouth movements. These representations generalize across the actor performing the movement, and across small differences in visual position. Critically, patterns of response to combined movements could be well predicted by linear combinations of responses to individual eye and mouth movements, pointing to a parts-based representation of complex face movements. These results indicate that the fSTS plays an intermediate role in the process of inferring social content from visually perceived face movements, containing a representation that is sufficiently abstract to generalize across low-level visual details, but still tied to the kinematics of face part movements.
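The linearity test described here amounts to fitting the combined-movement pattern as a weighted sum of the part patterns; a schematic version with synthetic voxel patterns (placeholder data, not the study's analysis code) might look like:

    import numpy as np

    rng = np.random.default_rng(0)
    n_voxels = 300
    pattern_eye = rng.standard_normal(n_voxels)
    pattern_mouth = rng.standard_normal(n_voxels)
    # Observed response to the combined movement (synthetic placeholder).
    pattern_combined = (0.7 * pattern_eye + 0.6 * pattern_mouth
                        + 0.3 * rng.standard_normal(n_voxels))

    # Least-squares weights for predicting the combined pattern from the parts.
    A = np.column_stack([pattern_eye, pattern_mouth])
    weights, *_ = np.linalg.lstsq(A, pattern_combined, rcond=None)
    prediction = A @ weights

    r = np.corrcoef(prediction, pattern_combined)[0, 1]
    print("weights:", weights, " prediction-observation correlation:", r)
    # A high correlation indicates a parts-based (linear) code for the combined
    # movement, the signature reported in the paper.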

}, issn = {1065-9471}, doi = {10.1002/hbm.v40.810.1002/hbm.24540}, url = {https://onlinelibrary.wiley.com/toc/10970193/40/8}, author = {Ben Deen and Rebecca Saxe} } @conference {4450, title = {People{\textquoteright}s perceptions of others{\textquoteright} risk preferences.}, booktitle = {Cognitive Science Society}, year = {2019}, author = {Shari Liu and John P. McCoy and Ullman, Tomer D.} } @article {4500, title = {A perceptually inspired generative model of rigid-body contact sounds}, journal = {Proceedings of the 22nd International Conference on Digital Audio Effects (DAFx-19)}, year = {2019}, month = {09/2019}, abstract = {

Contact between rigid-body objects produces a diversity of impact and friction sounds. These sounds can be synthesized with detailed simulations of the motion, vibration and sound radiation of the objects, but such synthesis is computationally expensive and prohibitively slow for many applications. Moreover, detailed physical simulations may not be necessary for perceptually compelling synthesis; humans infer ecologically relevant causes of sound, such as material categories, but not with arbitrary precision. We present a generative model of impact sounds which summarizes the effect of physical variables on acoustic features via statistical distributions fit to empirical measurements of object acoustics. Perceptual experiments show that sampling from these distributions allows efficient synthesis of realistic impact and scraping sounds that convey material, mass, and motion.

}, author = {James Traer and Maddie Cusimano and Josh H. McDermott} } @conference {4537, title = {Properties of invariant object recognition in human one-shot learning suggests a hierarchical architecture different from deep convolutional neural networks}, booktitle = {Vision Science Society}, year = {2019}, month = {05/2019}, address = {Florida, USA}, author = {Yena Han and Gemma Roig and Geiger, Gad and Tomaso Poggio} } @conference {4541, title = {Properties of invariant object recognition in human oneshot learning suggests a hierarchical architecture different from deep convolutional neural networks }, booktitle = {Vision Science Society}, year = {2019}, month = {05/2019}, address = {St Pete Beach, FL, USA}, doi = {10.1167/19.10.28d}, url = {https://jov.arvojournals.org/article.aspx?articleid=2749961https://jov.arvojournals.org/article.aspx?articleid=2749961}, author = {Yena Han and Gemma Roig and Geiger, Gad and Tomaso Poggio} } @conference {4544, title = {Pupillary responses track changes in arousal and attention while exploring a virtual reality environment}, booktitle = {The Rockefeller University 2019 Summer Undergraduate Research Fellowship (SURF) Program}, year = {2019}, month = {08/2019}, address = {New York, NY, USA}, author = {Otero Coronel, Santiago and Phillips-Jones, Taylor and Sani, Ilaria and W. A. Freiwald} } @conference {4523, title = {Query-guided visual search }, booktitle = {41st Annual conference of the Cognitive Science Society}, year = {2019}, month = {07/2019}, address = {Montreal, Qu{\'e}bec, Canada}, author = {Junyi Chu and Jon Gauthier and Roger Levy and Joshua B. Tenenbaum and Laura Schulz} } @article {4556, title = {Representational similarity precedes category selectivity in the developing ventral visual pathway}, journal = {NeuroImage}, volume = {197}, year = {2019}, month = {Jan-08-2019}, pages = {565 - 574}, issn = {10538119}, doi = {10.1016/j.neuroimage.2019.05.010}, url = {https://www.ncbi.nlm.nih.gov/pubmed/31077844}, author = {Cohen, Michael A. and Dilks, Daniel D. and Kami Koldewyn and Weigelt, Sarah and Jenelle Feather and Alexander J. E. Kell and Keil, Boris and Fischl, Bruce and Z{\"o}llei, Lilla and Lawrence Wald and Rebecca Saxe and Nancy Kanwisher} } @conference {4526, title = {Scrape, rub, and roll: causal inference in the perception of sustained contact sounds }, booktitle = {Cognitive Science}, year = {2019}, month = {07/2019}, address = {Montreal, Qu{\'e}bec, Canada}, author = {Maddie Cusimano and James Traer and Josh H. McDermott} } @article {4118, title = {See, feel, act: Hierarchical learning for complex manipulation skills with multisensory fusion}, journal = {Science Robotics}, volume = {4}, year = {2019}, month = {01/2019}, pages = {eaav3123}, abstract = {

Humans are able to seamlessly integrate tactile and visual stimuli with their intuitions to explore and execute complex manipulation skills. They not only see but also feel their actions. Most current robotic learning methodologies exploit recent progress in computer vision and deep learning to acquire data-hungry pixel-to-action policies. These methodologies do not exploit intuitive latent structure in physics or tactile signatures. Tactile reasoning is omnipresent in the animal kingdom, yet it is underdeveloped in robotic manipulation. Tactile stimuli are only acquired through invasive interaction, and interpretation of the data stream together with visual stimuli is challenging. Here, we propose a methodology to emulate hierarchical reasoning and multisensory fusion in a robot that learns to play Jenga, a complex game that requires physical interaction to be played effectively. The game mechanics were formulated as a generative process using a temporal hierarchical Bayesian model, with representations for both behavioral archetypes and noisy block states. This model captured descriptive latent structures, and the robot learned probabilistic models of these relationships in force and visual domains through a short exploration phase. Once learned, the robot used this representation to infer block behavior patterns and states as it played the game. Using its inferred beliefs, the robot adjusted its behavior with respect to both its current actions and its game strategy, similar to the way humans play the game. We evaluated the performance of the approach against three standard baselines and show its fidelity on a real-world implementation of the game.

}, doi = {10.1126/scirobotics.aav3123}, url = {http://robotics.sciencemag.org/lookup/doi/10.1126/scirobotics.aav3123}, author = {Fazeli, N. and Oller, M. and Wu, J. and Wu, Z. and Joshua B. Tenenbaum and Rodriguez, A.} } @article {4281, title = {Theoretical Issues in Deep Networks}, year = {2019}, month = {08/2019}, abstract = {

While deep learning is successful in a number of applications, it is not yet well understood theoretically.\  A theoretical\ characterization of deep learning should answer questions about their approximation power, the dynamics of optimization by gradient descent and good out-of-sample performance --- why the expected error does not suffer, despite the absence of explicit regularization, when the networks are overparametrized. We review our recent results towards this goal.\ In {\it approximation theory} both shallow and deep networks are known to approximate any continuous functions on a bounded domain at a cost which is exponential (the number of parameters is exponential in the dimensionality of the function). However, we proved that for certain types of compositional functions, deep networks of the convolutional type (even without weight sharing) can have a linear dependence on dimensionality, unlike shallow networks. In characterizing {\it minimization} of the empirical exponential loss we consider the gradient descent dynamics of the weight directions rather than the weights themselves, since the relevant function underlying classification corresponds to the normalized network. The dynamics of the normalized weights implied by standard gradient descent turns out to be equivalent to the dynamics of the constrained problem of minimizing an exponential-type loss subject to a unit $L_2$ norm constraint. In particular, the dynamics of the typical, unconstrained gradient descent converges to the same critical points of the constrained problem. Thus, there is {\it implicit regularization} in training deep networks under exponential-type loss functions with gradient descent. The critical points of the flow are hyperbolic minima (for any long but finite time) and minimum norm minimizers (e.g. maxima of the margin). Though appropriately normalized networks can show a small generalization gap (difference between empirical and expected loss) even for finite $N$ (number of training examples) wrt the exponential loss, they do not generalize in terms of the classification error. Bounds on it for finite $N$ remain an open problem. Nevertheless, our results, together with other recent papers, characterize an implicit vanishing regularization by gradient descent which is likely to be a key prerequisite -- in terms of complexity control -- for the good performance of deep overparametrized ReLU classifiers.
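As a sketch of the constrained problem referred to above (our notation, following the abstract rather than the paper's exact statement): for a deep ReLU network the output is homogeneous in each layer's weights, so one can write $f(x; W_1,\ldots,W_K) = \rho\, f(x; \tilde W_1,\ldots,\tilde W_K)$ with $\tilde W_k = W_k/\|W_k\|_2$ and $\rho = \prod_k \|W_k\|_2$. Gradient descent on an exponential-type loss

$$ L(W) = \sum_{n=1}^{N} e^{-y_n f(x_n; W)} $$

then induces, for the weight directions $\tilde W_k$, the same critical points as the constrained problem

$$ \min_{\tilde W_1,\ldots,\tilde W_K} \; \sum_{n=1}^{N} e^{-\rho\, y_n f(x_n; \tilde W_1,\ldots,\tilde W_K)} \quad \text{subject to } \|\tilde W_k\|_2 = 1 \ \text{for each } k, $$

whose minimizers correspond, per the abstract, to minimum-norm solutions such as maxima of the margin.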

}, author = {Tomaso Poggio and Andrzej Banburski and Qianli Liao} } @article {4515, title = {Theories of Deep Learning: Approximation, Optimization and Generalization }, year = {2019}, month = {09/2019}, author = {Qianli Liao and Andrzej Banburski and Tomaso Poggio} } @conference {4322, title = {To find better neural network models of human vision, find better neural network models of primate vision}, booktitle = {BioRxiv}, year = {2019}, abstract = {

Specific deep artificial neural networks (ANNs) are the current best models of ventral visual processing and object recognition behavior in monkeys. We here explore whether models of non-human primate vision generalize to visual processing in the human primate brain. Specifically, we asked if model match to monkey IT is a predictor of model match to human IT, even when scoring those matches on different images. We found that the model match to monkey IT is a positive predictor of the model match to human IT (R = 0.36), and that this approach outperforms the current standard predictor of model accuracy on ImageNet. This suggests a more powerful approach for pre-selecting models as hypotheses of human brain processing.

}, url = {https://www.biorxiv.org/content/10.1101/688390v1.full}, author = {K.M. Jozwik and Martin Schrimpf and Nancy Kanwisher and James J. DiCarlo} } @article {4201, title = {The transdiagnostic structure of mental effort avoidance}, journal = {Scientific Reports}, volume = {9}, year = {2019}, month = {02/2019}, abstract = {

The law of least mental effort states that, everything else being equal, the brain tries to minimize mental effort expenditure during task performance by avoiding decisions that require greater cognitive demands. Prior studies have shown associations between disruptions in effort expenditure and specific psychiatric illnesses (e.g., schizophrenia and depression) or clinically-related symptoms and traits (e.g., anhedonia and apathy), yet no research has explored this issue transdiagnostically. Specifically, this research has largely focused on a single diagnostic category, symptom, or trait. However, abnormalities in effort expression could be related to several different psychiatrically-relevant constructs that cut across diagnostic boundaries. Therefore, we examined the relationship between avoidance of mental effort and a diverse set of clinically-related symptoms and traits, and transdiagnostic latent factors in a large sample (n = 811). Only lack of perseverance, a dimension of impulsiveness, was associated with increased avoidance of mental effort. In contrast, several constructs were associated with less mental effort avoidance, including positive urgency, distress intolerance, obsessive-compulsive symptoms, disordered eating, and a factor consisting of compulsive behavior and intrusive thoughts. These findings demonstrate that deviations from normative effort expenditure are associated with a number of constructs that are common to several forms of psychiatric illness.

}, doi = {10.1038/s41598-018-37802-1}, url = {http://www.nature.com/articles/s41598-018-37802-1}, author = {Patzelt, Edward H. and Kool, Wouter and Millner, Alexander J. and Samuel J Gershman} } @article {4313, title = {Universal and Non-universal Features of Musical Pitch Perception Revealed by Singing}, journal = {Current Biology}, year = {2019}, month = {09/2019}, abstract = {

Musical pitch perception is argued to result from nonmusical biological constraints and thus to have similar characteristics across cultures, but its universality remains unclear. We probed pitch representations in residents of the Bolivian Amazon{\textemdash}the Tsimane{\textquoteright}, who live in relative isolation from Western culture{\textemdash}as well as US musicians and non-musicians. Participants sang back tone sequences presented in different frequency ranges. Sung responses of Amazonian and US participants approximately replicated heard intervals on a logarithmic scale, even for tones outside the singing range. Moreover, Amazonian and US reproductions both deteriorated for high-frequency tones even though they were fully audible. But whereas US participants tended to reproduce notes an integer number of octaves above or below the heard tones, Amazonians did not, ignoring the note {\textquotedblleft}chroma{\textquotedblright} (C, D, etc.). Chroma matching in US participants was more pronounced in US musicians than non-musicians, was not affected by feedback, and was correlated with similarity-based measures of octave equivalence as well as the ability to match the absolute f0 of a stimulus in the singing range. The results suggest the cross-cultural presence of logarithmic scales for pitch, and biological constraints on the limits of pitch, but indicate that octave equivalence may be culturally contingent, plausibly dependent on pitch representations that develop from experience with particular musical systems.
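The two measures discussed here, intervals on a logarithmic (semitone) scale and octave equivalence of note chroma, reduce to simple formulas; below is a small illustrative helper (our own, not the study's analysis code):

    import math

    def semitones(f1, f2):
        """Interval between two frequencies on a logarithmic (semitone) scale."""
        return 12.0 * math.log2(f2 / f1)

    def same_chroma(f1, f2, tol=0.5):
        """True if f2 is an integer number of octaves from f1 (within tol semitones)."""
        st = semitones(f1, f2)
        return abs(st - 12.0 * round(st / 12.0)) < tol

    print(semitones(220.0, 440.0))      # 12.0 -> one octave
    print(same_chroma(220.0, 880.0))    # True: two octaves up, same chroma (A)
    print(same_chroma(220.0, 660.0))    # False: a twelfth up, different chroma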

}, keywords = {absolute pitch, bio-musicology, cross-cultural psychology, mental scales, music cognition, octave equivalence, pitch, relative pitch, singing, Tsimane{\textquoteright}}, issn = {09609822}, doi = {10.1016/j.cub.2019.08.020}, url = {https://linkinghub.elsevier.com/retrieve/pii/S096098221931036X}, author = {Jacoby, Nori and Undurraga, Eduardo A. and McPherson, Malinda J. and Vald{\'e}s, Joaqu{\'\i}n and Ossand{\'o}n, Tom{\'a}s and Josh H. McDermott} } @proceedings {4389, title = {Untangling in Invariant Speech Recognition}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Encouraged by the success of deep convolutional neural networks on a variety of visual tasks, much theoretical and experimental work has been aimed at understanding and interpreting how vision networks operate. At the same time, deep neural networks have also achieved impressive performance in audio processing applications, both as sub-components of larger systems and as complete end-to-end systems by themselves. Despite their empirical successes, comparatively little is understood about how these audio models accomplish these tasks. In this work, we employ a recently developed statistical mechanical theory that connects geometric properties of network representations and the separability of classes to probe how information is untangled within neural networks trained to recognize speech. We observe that speaker-specific nuisance variations are discarded by the network{\textquoteright}s hierarchy, whereas task-relevant properties such as words and phonemes are untangled in later layers. Higher level concepts such as parts-of-speech and context dependence also emerge in the later layers of the network. Finally, we find that the deep representations carry out significant temporal untangling by efficiently extracting task-relevant features at each time step of the computation. Taken together, these findings shed light on how deep auditory models process their time dependent input signals to carry out invariant speech recognition, and show how different concepts emerge through the layers of the network.

}, author = {Cory Stephenson and Jenelle Feather and Suchismita Padhy and Oguz Elibol and Hanlin Tang and Josh H. McDermott and SueYeon Chung} } @article {4087, title = {Using neuroscience to develop artificial intelligence}, journal = {Science}, volume = {363}, year = {2019}, month = {02/2019}, pages = {692 - 693}, chapter = {692}, abstract = {

When the mathematician Alan Turing posed the question {\textquotedblleft}Can machines think?{\textquotedblright} in the first line of his seminal 1950 paper that ushered in the quest for artificial intelligence (AI) (1), the only known systems carrying out complex computations were biological nervous systems. It is not surprising, therefore, that scientists in the nascent field of AI turned to brain circuits as a source for guidance. One path that was taken since the early attempts to perform intelligent computation by brain-like circuits (2), and which led recently to remarkable successes, can be described as a highly reductionist approach to model cortical circuitry. In its basic current form, known as a {\textquotedblleft}deep network{\textquotedblright} (or deep net) architecture, this brain-inspired model is built from successive layers of neuron-like elements, connected by adjustable weights, called {\textquotedblleft}synapses{\textquotedblright} after their biological counterparts (3). The application of deep nets and related methods to AI systems has been transformative. They proved superior to previously known methods in central areas of AI research, including computer vision, speech recognition and production, and playing complex games. Practical applications are already in broad use, in areas such as computer vision and speech and text translation, and large-scale efforts are under way in many other areas. Here, I discuss how additional aspects of brain circuitry could supply cues for guiding network models toward broader aspects of cognition and general AI.

}, issn = {0036-8075}, doi = {10.1126/science.aau6595}, url = {http://www.sciencemag.org/lookup/doi/10.1126/science.aau6595}, author = {Shimon Ullman} } @conference {4545, title = {A Virtual Reality Experimental Approach for Studying How the Brain Implements Attentive Behaviors}, booktitle = {Tri-Institute 2019 Gateways to the Laboratory Summer Program}, year = {2019}, month = {08/2019}, address = {New York, NY, USA}, author = {Phillips-Jones, Taylor and Otero Coronel, Santiago and Sani, Ilaria and W. A. Freiwald} } @proceedings {4390, title = {Visual Concept-Metaconcept Learning}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

Humans reason with concepts and metaconcepts: we recognize red and blue from visual input; we also understand that they are colors, i.e., red is an instance of color. In this paper, we propose the visual concept-metaconcept learner (VCML) for joint learning of concepts and metaconcepts from images and associated question-answer pairs. The key is to exploit the bidirectional connection between visual concepts and metaconcepts. Visual representations provide grounding cues for predicting relations between unseen pairs of concepts. Knowing that red and blue are instances of color, we generalize to the fact that green is also an instance of color since they all categorize the hue of objects. Meanwhile, knowledge about metaconcepts empowers visual concept learning from limited, noisy, and even biased data. From just a few examples of purple cubes we can understand a new color purple, which resembles the hue of the cubes instead of the shape of them. Evaluation on both synthetic and real-world datasets validates our claims.

}, author = {Chi Han and Jiayuan Mao and Chuang Gan and Joshua B. Tenenbaum and Jiajun Wu} } @conference {4536, title = {Visual Features for Invariant Coding by Face Selective Neurons }, booktitle = {2019 Conference on Cognitive Computational Neuroscience (CCN)}, year = {2019}, month = {09/2019}, address = {Berlin, Germany}, author = {Zarco, Wilbert and W. A. Freiwald} } @conference {4517, title = {Weight and Batch Normalization implement Classical Generalization Bounds }, booktitle = {ICML}, year = {2019}, month = {06/2019}, address = {Long Beach/California}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Jack Hidary and Tomaso Poggio} } @inbook {4459, title = {What do neurons really want? The role of semantics in cortical representations.}, booktitle = {Psychology of Learning and Motivation}, volume = {70}, year = {2019}, chapter = {7}, abstract = {

What visual inputs best trigger activity for a given neuron in cortex and what type of semantic information may guide those neuronal responses? We revisit the methodologies used so far to design visual experiments, and what those methodologies have taught us about neural coding in visual cortex. Despite heroic and seminal work in ventral visual cortex, we still do not know what types of visual features are optimal for cortical neurons. We briefly review state-of-the-art standard models of visual recognition and argue that such models should constitute the null hypothesis for any measurement that purports to ascribe semantic meaning to neuronal responses. While it remains unclear when, where, and how abstract semantic information is incorporated in visual neurophysiology, there exists clear evidence of top-down modulation in the form of attention, task-modulation and expectations. Such top-down signals open the doors to some of the most exciting questions today toward elucidating how abstract knowledge can be incorporated into our models of visual processing.

}, doi = {https://doi.org/10.1016/bs.plm.2019.03.005}, author = {Gabriel Kreiman} } @proceedings {4384, title = {Write, Execute, Assess: Program Synthesis with a REPL}, year = {2019}, month = {11/2019}, address = {Vancouver, Canada}, abstract = {

We present a neural program synthesis approach integrating components which write, execute, and assess code to navigate the search space of possible programs. We equip the search process with an interpreter or a read-eval-print-loop (REPL), which immediately executes partially written programs, exposing their semantics. The REPL addresses a basic challenge of program synthesis: tiny changes in syntax can lead to huge changes in semantics. We train a pair of models, a policy that proposes the new piece of code to write, and a value function that assesses the prospects of the code written so-far. At test time we can combine these models with a Sequential Monte Carlo algorithm. We apply our approach to two domains: synthesizing text editing programs and inferring 2D and 3D graphics programs.

}, author = {Kevin Ellis and Maxwell Nye and Yewen Pu and Felix Sosa and Joshua B. Tenenbaum and Armando Solar-Lezama} } @article {3563, title = {Adaptive Coding for Dynamic Sensory Inference}, journal = {eLife}, year = {2018}, month = {07/2018}, abstract = {

Behavior relies on the ability of sensory systems to infer properties of the environment from incoming stimuli. The accuracy of inference depends on the fidelity with which behaviorally-relevant properties of stimuli are encoded in neural responses. High-fidelity encodings can be metabolically costly, but low-fidelity encodings can cause errors in inference. Here, we discuss general principles that underlie the tradeoff between encoding cost and inference error. We then derive adaptive encoding schemes that dynamically navigate this tradeoff. These optimal encodings tend to increase the fidelity of the neural representation following a change in the stimulus distribution, and reduce fidelity for stimuli that originate from a known distribution. We predict dynamical signatures of such encoding schemes and demonstrate how known phenomena, such as burst coding and firing rate adaptation, can be understood as hallmarks of optimal coding for accurate inference.

Link to bioRxiv preprint: https://www.biorxiv.org/content/early/2018/04/01/189506

}, author = {Wiktor Mlynarski and Ann M. Hermundstad} } @article {3315, title = {An analysis of training and generalization errors in shallow and deep networks}, year = {2018}, month = {02/2018}, abstract = {

An open problem around deep networks is the apparent absence of over-fitting despite large over-parametrization which allows perfect fitting of the training data. In this paper, we explain this phenomenon when each unit evaluates a trigonometric polynomial. It is well understood in the theory of function approximation that approximation by trigonometric polynomials is a {\textquotedblleft}role model{\textquotedblright} for many other processes of approximation that have inspired many theoretical constructions also in the context of approximation by neural and RBF networks. In this paper, we argue that the maximum loss functional is necessary to measure the generalization error. We give estimates on exactly how many parameters ensure both zero training error as well as a good generalization error, and how much error to expect at which test data. An interesting feature of our new method is that the variance in the training data is no longer an insurmountable lower bound on the generalization error.

}, keywords = {deep learning, generalization error, interpolatory approximation}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @proceedings {3651, title = {Assessing Language Proficiency from Eye Movements in Reading}, year = {2018}, month = {06/2018}, address = {New Orleans}, keywords = {Computation, language}, url = {http://naacl2018.org/}, author = {Yevgeni Berzak and Boris Katz and Roger Levy} } @article {3773, title = {At 4.5 but not 5.5 years, children favor kin when the stakes are moderately high}, journal = {PLOS ONE}, volume = {13}, year = {2018}, month = {08/2018}, chapter = {e0202507}, doi = {10.1371/journal.pone.0202507}, url = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0202507}, author = {A C Spokes and Elizabeth S Spelke} } @article {4162, title = {Biologically-plausible learning algorithms can scale to large datasets}, year = {2018}, month = {11/2018}, abstract = {

The backpropagation (BP) algorithm is often thought to be biologically implausible in the brain. One of the main reasons is that BP requires symmetric weight matrices in the feedforward and feedback pathways. To address this "weight transport problem" (Grossberg, 1987), two more biologically plausible algorithms, proposed by Liao et al. (2016) and Lillicrap et al. (2016), relax BP{\textquoteright}s weight symmetry requirements and demonstrate learning capabilities comparable to those of BP on small datasets. However, a recent study by Bartunov et al. (2018) evaluates variants of target-propagation (TP) and feedback alignment (FA) on the MNIST, CIFAR, and ImageNet datasets, and finds that although many of the proposed algorithms perform well on MNIST and CIFAR, they perform significantly worse than BP on ImageNet. Here, we additionally evaluate the sign-symmetry algorithm (Liao et al., 2016), which differs from both BP and FA in that the feedback and feedforward weights share signs but not magnitudes. We examine the performance of sign-symmetry and feedback alignment on ImageNet and MS COCO datasets using different network architectures (ResNet-18 and AlexNet for ImageNet, RetinaNet for MS COCO). Surprisingly, networks trained with sign-symmetry can attain classification performance approaching that of BP-trained networks. These results complement the study by Bartunov et al. (2018), and establish a new benchmark for future biologically plausible learning algorithms on more difficult datasets and more complex architectures.
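A toy sketch of the sign-symmetry idea on a two-layer network (ours; far from the ResNet/RetinaNet experiments described above): the backward pass propagates the error through the signs of the forward weights rather than through their exact transposed values.

    import numpy as np

    rng = np.random.default_rng(0)
    n_in, n_hidden, n_out = 20, 64, 5
    W1 = rng.standard_normal((n_hidden, n_in)) * 0.1
    W2 = rng.standard_normal((n_out, n_hidden)) * 0.1

    def forward(x):
        h = np.maximum(W1 @ x, 0.0)        # ReLU hidden layer
        return h, W2 @ h

    def sign_symmetry_update(x, y_target, lr=0.01):
        """One update where the error is fed back through sign(W2) rather than W2.T.
        (Backprop would use W2.T; feedback alignment would use a fixed random matrix.)"""
        global W1, W2
        h, y = forward(x)
        e_out = y - y_target                          # output error
        e_hid = (np.sign(W2).T @ e_out) * (h > 0)     # feedback through weight signs only
        W2 -= lr * np.outer(e_out, h)
        W1 -= lr * np.outer(e_hid, x)

    # Toy regression task: learn a fixed random target mapping; tracking the
    # squared error over training shows whether sign-only feedback still provides
    # a useful teaching signal.
    M = rng.standard_normal((n_out, n_in))
    for _ in range(2000):
        x = rng.standard_normal(n_in)
        sign_symmetry_update(x, M @ x)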

}, author = {Will Xiao and Honglin Chen and Qianli Liao and Tomaso Poggio} } @article {3398, title = {Brain-Observatory-Toolbox}, year = {2018}, month = {01/2018}, abstract = {

A MATLAB toolbox for interacting with the Allen Brain Observatory.

Software can be accessed and downloaded from the Github repo here - https://github.com/emeyers/Brain-Observatory-Toolbox

}, author = {Dylan Muir and Xinzhu Fang and Ethan Meyers} } @article {4294, title = {Brain-Score: Which Artificial Neural Network for Object Recognition is most Brain-Like?}, journal = {bioRxiv preprint}, year = {2018}, abstract = {

The internal representations of early deep artificial neural networks (ANNs) were found to be remarkably similar to the internal neural representations measured experimentally in the primate brain. Here we ask, as deep ANNs have continued to evolve, are they becoming more or less brain-like? ANNs that are most functionally similar to the brain will contain mechanisms that are most like those used by the brain. We therefore developed Brain-Score {\textendash} a composite of multiple neural and behavioral benchmarks that score any ANN on how similar it is to the brain{\textquoteright}s mechanisms for core object recognition {\textendash} and we deployed it to evaluate a wide range of state-of-the-art deep ANNs. Using this scoring system, we here report that: (1) DenseNet-169, CORnet-S and ResNet-101 are the most brain-like ANNs. (2) There remains considerable variability in neural and behavioral responses that is not predicted by any ANN, suggesting that no ANN model has yet captured all the relevant mechanisms. (3) Extending prior work, we found that gains in ANN ImageNet performance led to gains on Brain-Score. However, correlation weakened at >= 70\% top-1 ImageNet performance, suggesting that additional guidance from neuroscience is needed to make further advances in capturing brain mechanisms. (4) We uncovered smaller (i.e. less complex) ANNs that are more brain-like than many of the best-performing ImageNet models, which suggests the opportunity to simplify ANNs to better understand the ventral stream. The scoring system used here is far from complete. However, we propose that evaluating and tracking model-benchmark correspondences through a Brain-Score that is regularly updated with new brain data is an exciting opportunity: experimental benchmarks can be used to guide machine network evolution, and machine networks are mechanistic hypotheses of the brain{\textquoteright}s network and thus drive next experiments. To facilitate both of these, we release Brain-Score.org: a platform that hosts the neural and behavioral benchmarks, where ANNs for visual processing can be submitted to receive a Brain-Score and their rank relative to other models, and where new experimental data can be naturally incorporated.
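As a purely illustrative sketch of what a composite benchmark score can look like, the snippet below averages ceiling-normalized per-benchmark scores; the benchmark names, ceiling values, and aggregation rule are hypothetical and are not the Brain-Score.org API.

# Hypothetical aggregation of per-benchmark scores into one composite,
# in the spirit of a Brain-Score-style evaluation (names and numbers invented).
def composite_score(benchmark_scores, ceilings):
    """Both arguments map benchmark name -> value; returns the mean normalized score."""
    normalized = [benchmark_scores[b] / ceilings[b] for b in benchmark_scores]
    return sum(normalized) / len(normalized)

print(composite_score({"V4-neural": 0.55, "IT-neural": 0.48, "behavior": 0.36},
                      {"V4-neural": 0.80, "IT-neural": 0.75, "behavior": 0.60}))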

}, keywords = {computational neuroscience, deep learning, Neural Networks, object recognition, ventral stream}, doi = {10.1101/407007}, url = {https://www.biorxiv.org/content/10.1101/407007v1}, author = {Martin Schrimpf and Jonas Kubilius}, editor = {Ha Hong and Najib J. Majaj and Rishi Rajalingham and Elias B. Issa and Kohitij Kar and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Daniel L K Yamins and James J. DiCarlo} } @article {3962, title = {Can Deep Neural Networks Do Image Segmentation by Understanding Insideness?}, year = {2018}, month = {12/2018}, abstract = {

THIS MEMO IS REPLACED BY CBMM MEMO 105

A key component of visual cognition is the understanding of spatial relationships among objects. Albeit effortless to our visual system, state-of-the-art artificial neural networks struggle to distinguish basic spatial relationships among elements in an image. As shown here, deep neural networks (DNNs) trained with hundreds of thousands of labeled examples cannot accurately distinguish whether pixels lie inside or outside 2D shapes, a problem that seems much simpler than image segmentation. In this paper, we sought to analyze the capability of DNNs to solve such inside/outside problems using an analytical approach. We demonstrate that it is a mathematically tractable problem and that two previously proposed algorithms, namely the Ray-Intersection Method and the Coloring Method, achieve perfect accuracy when implemented in the form of DNNs.
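A short sketch of the coloring idea named above, assuming a binary image of a closed, one-pixel-thick curve: flood-fill the background from the image border, and any off-curve pixel that is never reached is inside. This is the plain algorithm, not the DNN implementation analyzed in the memo.

from collections import deque
import numpy as np

def insideness(curve):
    """curve: 2D 0/1 array marking a closed curve; returns a 0/1 insideness map."""
    H, W = curve.shape
    outside = np.zeros_like(curve, dtype=bool)
    # Seed the flood fill with every off-curve pixel on the image border.
    q = deque((i, j) for i in range(H) for j in range(W)
              if (i in (0, H - 1) or j in (0, W - 1)) and curve[i, j] == 0)
    for i, j in q:
        outside[i, j] = True
    while q:                                  # grow the background region
        i, j = q.popleft()
        for di, dj in ((1, 0), (-1, 0), (0, 1), (0, -1)):
            ni, nj = i + di, j + dj
            if 0 <= ni < H and 0 <= nj < W and not outside[ni, nj] and curve[ni, nj] == 0:
                outside[ni, nj] = True
                q.append((ni, nj))
    return ((~outside) & (curve == 0)).astype(int)

square = np.zeros((7, 7), dtype=int)
square[1, 1:6] = square[5, 1:6] = square[1:6, 1] = square[1:6, 5] = 1
print(insideness(square))   # ones only in the interior of the square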

}, author = {Kimberly M. Villalobos and Jamel Dozier and Vilim Stih and Andrew Francl and Frederico Azevedo and Tomaso Poggio and Tomotake Sasaki and Xavier Boix} } @article {3703, title = {Classical generalization bounds are surprisingly tight for Deep Networks}, year = {2018}, month = {07/2018}, abstract = {

Deep networks are usually trained and tested in a regime in which the training classification error is not a good predictor of the test error. Thus the consensus has been that generalization, defined as convergence of the empirical to the expected error, does not hold for deep networks. Here we show that, when normalized appropriately after training, deep networks trained on exponential type losses show a good linear dependence of test loss on training loss. The observation, motivated by a previous theoretical analysis of overparametrization and overfitting, not only demonstrates the validity of classical generalization bounds for deep learning but suggests that they are tight. In addition, we also show that the bound of the classification error by the normalized cross entropy loss is empirically rather tight on the data sets we studied.
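For reference, a schematic of the kind of classical bound the abstract appeals to, stated for a loss bounded in [0, 1] and a suitably normalized network \(\tilde f\) trained on \(n\) samples; the constants and notation are generic, not those of the paper:

\[
L(\tilde f) \;\le\; \hat L_n(\tilde f) \;+\; 2\,\mathfrak{R}_n(\tilde{\mathcal F}) \;+\; \sqrt{\frac{\ln(1/\delta)}{2n}} \qquad \text{with probability } 1-\delta,
\]

where \(L\) is the expected loss, \(\hat L_n\) the empirical (training) loss, and \(\mathfrak{R}_n(\tilde{\mathcal F})\) the Rademacher complexity of the normalized function class, which typically decays like \(1/\sqrt{n}\).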

}, author = {Qianli Liao and Brando Miranda and Jack Hidary and Tomaso Poggio} } @article {3536, title = {Comparing human and monkey neural circuits for processing social scenes}, year = {2018}, month = {08/2018}, address = {Seattle, WA}, url = {http://www.cnsorg.org/cns-2018}, author = {Julia Sliwa and S.R. Marvel and G.A. Ianni and W. A. Freiwald} } @article {3535, title = {Comparing human and monkey neural circuits for processing social scenes}, year = {2018}, month = {05/2018}, address = {Brooklyn, NY }, url = {http://www.socialaffectiveneuro.org/conferences.html}, author = {Julia Sliwa and S.R. Marvel and G.A. Ianni and W. A. Freiwald} } @article {4148, title = {Comparing human and monkey neural circuits for processing social scenes}, year = {2018}, month = {11/2017}, author = {Julia Sliwa and S. R. Marvel and G.A. Ianni and W. A. Freiwald} } @article {4149, title = {Comparing human and monkey neural circuits for processing social scenes}, year = {2018}, author = {Julia Sliwa and S. R. Marvel and G.A. Ianni and W. A. Freiwald} } @inbook {4004, title = {Compressed Learning for Image Classification: A Deep Neural Network Approach}, booktitle = {Handbook of Numerical Analysis}, volume = {19}, year = {2018}, month = {10/2018}, pages = {3 - 17}, publisher = {Elsevier}, organization = {Elsevier}, abstract = {

Compressed learning (CL) is a joint signal processing and machine learning framework for inference from a signal, using a small number of measurements obtained by a linear projection. In this chapter, we review this concept of compressed learning, which suggests that learning directly in the compressed domain is possible, and with good performance. We experimentally show that the classification accuracy, using an efficient classifier in the compressed domain, can be quite close to the accuracy obtained when operating directly on the original data. Using a convolutional neural network for image classification, we examine the performance of different linear sensing schemes for the data acquisition stage, such as random sensing and PCA projection. Then, we present an end-to-end deep learning approach for CL, in which a network composed of fully connected layers followed by convolutional ones, performs the linear sensing and the nonlinear inference stages simultaneously. During the training phase, both the sensing matrix and the nonlinear inference operator are jointly optimized, leading to a suitable sensing matrix and better performance for the overall task of image classification in the compressed domain. The performance of the proposed approach is demonstrated using the MNIST and CIFAR-10 datasets.

Full text available online - https://books.google.com/books?hl=en\&lr=\&id=zDx4DwAAQBAJ\&oi=fnd\&pg=PA3\&ots=vxCX2Ddl0f\&sig=RNZB40wA-2EFLjOpkazg8cnWyYo$\#$v=onepage\&q\&f=false
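A minimal NumPy sketch of the compressed-learning setting reviewed above: each signal is reduced to m << d random linear measurements y = Ax and a classifier is trained directly on y, never reconstructing x. A nearest-centroid classifier and synthetic two-class data stand in for the CNNs and image datasets used in the chapter.

import numpy as np

rng = np.random.default_rng(0)
d, m, n_per_class = 784, 64, 200                      # e.g. 28x28 images, 64 measurements
A = rng.normal(size=(m, d)) / np.sqrt(m)              # random Gaussian sensing matrix

prototypes = rng.normal(size=(2, d))                  # two synthetic "classes"
X = np.vstack([p + 0.5 * rng.normal(size=(n_per_class, d)) for p in prototypes])
labels = np.repeat([0, 1], n_per_class)

Y = X @ A.T                                           # compressed measurements y = A x
centroids = np.stack([Y[labels == c].mean(axis=0) for c in (0, 1)])
pred = np.argmin(((Y[:, None, :] - centroids[None]) ** 2).sum(-1), axis=1)
print("compressed-domain accuracy:", (pred == labels).mean())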

}, keywords = {Compressed learning, Compressed sensing, deep learning, Neural Networks, sparse coding, Sparse representation}, isbn = {9780444642059}, issn = {15708659}, doi = {10.1016/bs.hna.2018.08.002}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1570865918300024}, author = {Zisselman, E. and Amir Adler and Elad, M.} } @article {3331, title = {A computational perspective of the role of Thalamus in cognition}, journal = {arxiv}, year = {2018}, abstract = {
Thalamus has traditionally been considered as only a relay source of cortical inputs, with hierarchically organized cortical circuits serially transforming thalamic signals to cognitively-relevant representations. Given the absence of local excitatory connections within the thalamus, the notion of thalamic {\textquoteleft}relay{\textquoteright} seemed like a reasonable description over the last several decades. Recent advances in experimental approaches and theory provide a broader perspective on the role of the thalamus in cognitively-relevant cortical computations, and suggest that only a subset of thalamic circuit motifs fit the relay description. Here, we discuss this perspective and highlight the potential role for the thalamus in dynamic selection of cortical representations through a combination of intrinsic thalamic computations and output signals that change cortical network functional parameters. We suggest that through the contextual modulation of cortical computation, thalamus and cortex jointly optimize the information/cost tradeoff in an emergent fashion. We emphasize that coordinated experimental and theoretical efforts will provide a path to understanding the role of the thalamus in cognition, along with an understanding to augment cognitive capacity in health and disease. 
}, keywords = {Artificial Intelligence, Cognitive Computing, Multi-objective Optimization, Recurrent Neural Network, Reservoir Computing, Thalamo-cortical system}, url = {https://arxiv.org/abs/1803.00997}, author = {Nima Dehghani and Ralf Wimmer} } @article {3407, title = {Constant Modulus Algorithms via Low-Rank Approximation}, year = {2018}, month = {04/2018}, abstract = {

We present a novel convex-optimization-based approach to the solutions of a family of problems involving constant modulus signals. The family of problems includes the constant modulus and the constrained constant modulus, as well as the modified constant modulus and the constrained modified constant modulus. The usefulness of the proposed solutions is demonstrated for the tasks of blind beamforming and blind multiuser detection. The performance of these solutions, as we demonstrate by simulated data, is superior to existing methods.
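For orientation only, the snippet below shows the textbook constant-modulus cost J(w) = E[(|w^H x|^2 - 1)^2] with its stochastic-gradient (CMA) update; the paper's contribution is a convex low-rank reformulation of such problems, which is not reproduced here. The array geometry, noise level, and step size are invented for the example.

import numpy as np

rng = np.random.default_rng(0)
n_antennas, n_snapshots, mu = 4, 2000, 1e-3
a = np.exp(1j * np.pi * np.arange(n_antennas) * np.sin(0.3))    # steering vector of the source
s = np.exp(1j * 2 * np.pi * rng.random(n_snapshots))            # unit-modulus source signal
X = np.outer(a, s) + 0.1 * (rng.normal(size=(n_antennas, n_snapshots))
                            + 1j * rng.normal(size=(n_antennas, n_snapshots)))

w = np.zeros(n_antennas, dtype=complex)
w[0] = 1.0                                                       # initial beamformer
for t in range(n_snapshots):
    x = X[:, t]
    y = np.vdot(w, x)                                            # beamformer output w^H x
    w -= mu * (np.abs(y) ** 2 - 1) * np.conj(y) * x              # stochastic CMA gradient step
print("output modulus spread:", np.std(np.abs(w.conj() @ X)))    # shrinks as |y| approaches 1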

}, keywords = {Constant modulus, convex optimization, trace norm}, author = {Amir Adler and Mati Wax} } @conference {4005, title = {Constant Modulus Beamforming Via Low-Rank Approximation}, booktitle = {2018 IEEE Statistical Signal Processing Workshop (SSP)}, year = {2018}, address = {Freiburg im Breisgau, Germany}, abstract = {

We present novel convex-optimization-based solutions to the problem of blind beamforming of constant modulus signals, and to the related problem of linearly constrained blind beamforming of constant modulus signals. These solutions are based on a low-rank approximation, ensure global optimality and are parameter free, namely, do not contain any tuneable parameters and do not require any a-priori parameter settings. The proposed approach outperforms state-of-the-art both in terms of the number of required samples for convergence, and in terms of the beamformer output SINR.

}, isbn = {978-1-5386-1571-3}, doi = {10.1109/SSP.2018.8450799}, url = {https://ieeexplore.ieee.org/document/8450799/}, author = {Amir Adler and Mati Wax} } @article {3565, title = {Co-occurrence statistics of natural sound features predict perceptual grouping}, year = {2018}, month = {03/2018}, address = {Denver, Colorado}, url = {http://www.cosyne.org/c/index.php?title=Cosyne_18}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3578, title = {Co-occurrence statistics of natural sound features predict perceptual grouping}, year = {2018}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3905, title = {Cortex Is Cortex: Ubiquitous Principles Drive Face-Domain Development}, journal = {Trends in Cognitive Sciences}, year = {2018}, month = {11/2018}, abstract = {

Powell, Kosakowski, and Saxe [1] argued in a recent review that two bottom-up models previously proposed to account for the development of face domains in inferotemporal cortex (IT) [2, 3] are insufficient to explain the existing data. They proposed instead that face domains are predisposed to process faces via selective connectivity to social information in medial prefrontal cortex. Here we explain why activity-dependent mechanisms acting on a retinotopic proto-architecture provide a sufficient explanation for the development of face, and other category, domains...

}, keywords = {development, face domains, self-organizing systems}, issn = {13646613}, doi = {10.1016/j.tics.2018.10.009}, url = {https://linkinghub.elsevier.com/retrieve/pii/S1364661318302572}, author = {Margaret S Livingstone and Michael J Arcaro and Peter F Schade} } @article {3619, title = {Deep Nets: What have they ever done for Vision?}, year = {2018}, month = {05/2018}, abstract = {

This is an opinion paper about the strengths and weaknesses of Deep Nets. They are at the center of recent progress on Artificial Intelligence and are of growing importance in Cognitive Science and Neuroscience since they enable the development of computational models that can deal with a large range of visually realistic stimuli and visual tasks. They have clear limitations but they also have enormous successes. There is also gradual, though incomplete, understanding of their inner workings. It seems unlikely that Deep Nets in their current form will be the best long-term solution either for building general purpose intelligent machines or for understanding the mind/brain, but it is likely that many aspects of them will remain. At present Deep Nets do very well on specific types of visual tasks and on specific benchmarked datasets. But Deep Nets are much less general purpose, flexible, and adaptive than the human visual system. Moreover, methods like Deep Nets may run into fundamental difficulties when faced with the enormous complexity of natural images. To illustrate our main points, while keeping the references small, this paper is slightly biased towards work from our group.

}, author = {Alan Yuille and Chenxi Liu} } @article {3634, title = {Deep Regression Forests for Age Estimation}, year = {2018}, month = {06/2018}, abstract = {

Age estimation from facial images is typically cast as a nonlinear regression problem. The main challenge of this problem is that the facial feature space w.r.t. ages is inhomogeneous, due to the large variation in facial appearance across different persons of the same age and the non-stationary property of aging patterns. In this paper, we propose Deep Regression Forests (DRFs), an end-to-end model, for age estimation. DRFs connect the split nodes to a fully connected layer of a convolutional neural network (CNN) and deal with inhomogeneous data by jointly learning input-dependent data partitions at the split nodes and data abstractions at the leaf nodes. This joint learning follows an alternating strategy: First, by fixing the leaf nodes, the split nodes as well as the CNN parameters are optimized by Back-propagation; Then, by fixing the split nodes, the leaf nodes are optimized by iterating a step-size free update rule derived from Variational Bounding. We verify the proposed DRFs on three standard age estimation benchmarks and achieve state-of-the-art results on all of them.

}, author = {Wei Shen and Yilu Guo and Yan Wang and Kai Zhao and Bo Wang and Alan Yuille} } @conference {4112, title = {Deep sequential models for sampling-based planning}, booktitle = {The IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS 2018)}, year = {2018}, month = {10/2018 }, address = {Madrid, Spain}, abstract = {

We demonstrate how a sequence model and a sampling-based planner can influence each other to produce efficient plans and how such a model can automatically learn to take advantage of observations of the environment. Sampling-based planners such as RRT generally know nothing of their environments even if they have traversed similar spaces many times. A sequence model, such as an HMM or LSTM, guides the search for good paths. The resulting model, called DeRRT*, observes the state of the planner and the local environment to bias the next move and next planner state. The neural-network-based models avoid manual feature engineering by co-training a convolutional network which processes map features and observations from sensors. We incorporate this sequence model in a manner that combines its likelihood with the existing bias for searching large unexplored Voronoi regions. This leads to more efficient trajectories with fewer rejected samples even in difficult domains such as when escaping bug traps. This model can also be used for dimensionality reduction in multi-agent environments with dynamic obstacles. Instead of planning in a high-dimensional space that includes the configurations of the other agents, we plan in a low-dimensional subspace relying on the sequence model to bias samples using the observed behavior of the other agents. The techniques presented here are general, include both graphical models and deep learning approaches, and can be adapted to a range of planners.
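A hedged sketch of the general mechanism described above: an RRT-style planner whose next sample is drawn, with some probability, from a learned proposal conditioned on recent tree growth rather than uniformly. The model_proposal function is a placeholder for the DeRRT* network, and the toy 2D domain has no obstacles or collision checking.

import math, random

def model_proposal(recent_states):
    # Stand-in for a learned sequence model: propose near the last-added node.
    x, y = recent_states[-1]
    return (x + random.gauss(0, 0.1), y + random.gauss(0, 0.1))

def rrt_step(tree, goal, p_model=0.5, step=0.1):
    # With probability p_model, sample from the learned proposal; else uniformly.
    sample = (model_proposal(tree) if random.random() < p_model
              else (random.random(), random.random()))
    nearest = min(tree, key=lambda q: math.dist(q, sample))       # nearest tree node
    d = math.dist(nearest, sample) or 1e-9
    new = tuple(n + step * (s - n) / d for n, s in zip(nearest, sample))
    tree.append(new)                                              # (collision check omitted)
    return math.dist(new, goal) < step

tree, goal = [(0.05, 0.05)], (0.9, 0.9)
while not rrt_step(tree, goal):
    pass
print("nodes expanded:", len(tree))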

}, doi = {10.1109/IROS.2018.8593947}, url = {https://ieeexplore.ieee.org/document/8593947}, author = {Yen-Ling Kuo and Andrei Barbu and Boris Katz} } @article {3269, title = {Deep-learning tomography}, journal = {The Leading Edge}, volume = {37}, year = {2018}, month = {01/2018}, pages = {58 - 66}, abstract = {

Velocity model building (VMB) is a key step in hydrocarbon exploration; the main product of VMB is an initial model of the subsurface that is subsequently used in seismic imaging and interpretation workflows. Reflection or refraction tomography and full waveform inversion (FWI) are the most commonly used techniques in VMB. On one hand, tomography is a time-consuming activity that relies on successive updates of highly human-curated analysis of gathers. On the other hand, FWI is very computationally demanding with no guarantees of global convergence.

We propose and implement a novel concept that bypasses these demanding steps, directly producing an accurate gridded or layered velocity model from shot gathers. Our approach relies on training deep neural networks; the resulting predictive model maps relationships between the data space and the final output (particularly, the presence of high velocity segments that might indicate salt formations). In terms of time, the training task takes a few hours for 2D data, but the inference step (predicting a model from previously unseen data) takes only seconds.

The promising results shown here for synthetic 2D data demonstrate a new way of using seismic data and suggest fast turnaround of workflows that now make use of machine learning approaches to identify key structures in the subsurface.

}, keywords = {algorithm, full waveform inversion, Neural Networks, NMO, tomography}, issn = {1070-485X}, doi = {10.1190/tle37010058.1}, url = {https://library.seg.org/doi/10.1190/tle37010058.1}, author = {Mauricio Araya-Polo and Joseph Jennings and Amir Adler and Dahlke, Taylor} } @article {3511, title = {DeepVoting: A Robust and Explainable Deep Network for Semantic Part Detection under Partial Occlusion}, year = {2018}, month = {06/2018}, abstract = {

In this paper, we study the task of detecting semantic parts of an object, e.g., a wheel of a car, under partial occlusion. We propose that all models should be trained without seeing occlusions while being able to transfer the learned knowledge to deal with occlusions. This setting alleviates the difficulty in collecting an exponentially large dataset to cover occlusion patterns and is more essential. In this scenario, the proposal-based deep networks, like RCNN-series, often produce unsatisfactory results, because both the proposal extraction and classification stages may be confused by the irrelevant occluders. To address this, [25] proposed a voting mechanism that combines multiple local visual cues to detect semantic parts. The semantic parts can still be detected even though some visual cues are missing due to occlusions. However, this method is manually-designed, thus is hard to be optimized in an end-to-end manner.

In this paper, we present DeepVoting, which incorporates the robustness shown by [25] into a deep network, so that the whole pipeline can be jointly optimized. Specifically, it adds two layers after the intermediate features of a deep network, e.g., the pool-4 layer of VGGNet. The first layer extracts the evidence of local visual cues, and the second layer performs a voting mechanism by utilizing the spatial relationship between visual cues and semantic parts. We also propose an improved version DeepVoting+ by learning visual cues from context outside objects. In experiments, DeepVoting achieves significantly better performance than several baseline methods, including Faster-RCNN, for semantic part detection under occlusion. In addition, DeepVoting enjoys explainability as the detection results can be diagnosed via looking up the voting cues.

}, author = {Zhishuai Zhang and Cihang Xie and Jianyu Wang and Lingxi Xie and Alan Yuille} } @conference {3548, title = {DeepVoting: An Explainable Framework for Semantic Part Detection under Partial Occlusion}, booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2018}, month = {06/2018}, address = {Salt Lake City, Utah}, abstract = {

In this paper, we study the task of detecting semantic parts of an object, e.g., a wheel of a car, under partial occlusion. We propose that all models should be trained without seeing occlusions while being able to transfer the learned knowledge to deal with occlusions. This setting alleviates the difficulty in collecting an exponentially large dataset to cover occlusion patterns and is more essential. In this scenario, the proposal-based deep networks, like RCNN-series, often produce unsatisfactory results, because both the proposal extraction and classification stages may be confused by the irrelevant occluders. To address this, [25] proposed a voting mechanism that combines multiple local visual cues to detect semantic parts. The semantic parts can still be detected even though some visual cues are missing due to occlusions. However, this method is manually-designed, thus is hard to be optimized in an end-to-end manner. In this paper, we present DeepVoting, which incorporates the robustness shown by [25] into a deep network, so that the whole pipeline can be jointly optimized. Specifically, it adds two layers after the intermediate features of a deep network, e.g., the pool-4 layer of VGGNet. The first layer extracts the evidence of local visual cues, and the second layer performs a voting mechanism by utilizing the spatial relationship between visual cues and semantic parts. We also propose an improved version DeepVoting+ by learning visual cues from context outside objects. In experiments, DeepVoting achieves significantly better performance than several baseline methods, including Faster-RCNN, for semantic part detection under occlusion. In addition, DeepVoting enjoys explainability as the detection results can be diagnosed via looking up the voting cues.

}, url = {http://cvpr2018.thecvf.com/}, author = {Zhishuai Zhang and Cihang Xie and Jianyu Wang and Lingxi Xie and Alan Yuille} } @proceedings {3519, title = {Development of automated interictal spike detector}, year = {2018}, month = {07/2018}, address = {Honolulu, HI}, url = {https://embc.embs.org/2018/}, author = {A Palepu and Gabriel Kreiman} } @proceedings {4179, title = {Differentiable physics and stable modes for tool-use and manipulation planning}, year = {2018}, month = {06/2018}, abstract = {

We consider the problem of sequential manipulation and tool-use planning in domains that include physical interactions such as hitting and throwing. The approach integrates a Task And Motion Planning formulation with primitives that either impose stable kinematic constraints or differentiable dynamical and impulse exchange constraints at the path optimization level. We demonstrate our approach on a variety of physical puzzles that involve tool use and dynamic interactions. We then compare manipulation sequences generated by our approach to human actions on analogous tasks, suggesting future directions and illuminating current limitations.

}, author = {Marc Toussaint and Kelsey Allen and Kevin A Smith and Joshua B. Tenenbaum} } @article {3422, title = {Discovery and usage of joint attention in images}, journal = {arXiv.org}, year = {2018}, month = {04/2018}, abstract = {

Joint visual attention is characterized by two or more individuals looking at a common target at the same time. The ability to identify joint attention in scenes, the people involved, and their common target, is fundamental to the understanding of social interactions, including others{\textquoteright} intentions and goals. In this work we deal with the extraction of joint attention events, and the use of such events for image descriptions. The work makes two novel contributions. First, our extraction algorithm is the first which identifies joint visual attention in single static images. It computes 3D gaze direction, identifies the gaze target by combining gaze direction with a 3D depth map computed for the image, and identifies the common gaze target. Second, we use a human study to demonstrate the sensitivity of humans to joint attention, suggesting that the detection of such a configuration in an image can be useful for understanding the image, including the goals of the agents and their joint activity, and therefore can contribute to image captioning and related tasks.

}, keywords = {compositional approach, computational study, Gaze perception, human study, joint attention}, url = {https://arxiv.org/abs/1804.04604}, author = {Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} } @article {3896, title = {Dynamic population coding and its relationship to working memory}, journal = {Journal of Neurophysiology}, volume = {120}, year = {2018}, month = {10/2018}, pages = {2260 - 2268}, abstract = {

For over 45 years, neuroscientists have conducted experiments aimed at understanding the neural basis of working memory. Early results examining individual neurons highlighted that information is stored in working memory in persistent sustained activity where neurons maintained elevated firing rates over extended periods of time. However, more recent work has emphasized that information is often stored in working memory in dynamic population codes, where different neurons contain information at different periods in time. In this paper, I review findings that show that both sustained activity as well as dynamic codes are present in the prefrontal cortex and other regions during memory delay periods. I also review work showing that dynamic codes are capable of supporting working memory and that such dynamic codes could easily be {\textquotedblleft}readout{\textquotedblright} by downstream regions. Finally, I discuss why dynamic codes could be useful for enabling animals to solve tasks that involve working memory. Although additional work is still needed to know definitively whether dynamic coding is critical for working memory, the findings reviewed here give insight into how different codes could contribute to working memory, which should be useful for guiding future research.

}, issn = {0022-3077}, doi = {10.1152/jn.00225.2018}, url = {https://www.physiology.org/doi/10.1152/jn.00225.2018}, author = {Ethan Meyers} } @article {3540, title = {Efficient inverse graphics in biological face processing}, year = {2018}, month = {04/02/2018}, abstract = {

The visual system must not only recognize and localize objects, but perform much richer inferences about the underlying causes in the world that give rise to observed sense data. Analyzing scenes by inverting causal generative models, also known as "analysis-by-synthesis", has a long history in computational vision, and these models have some behavioral support, but they are typically too slow to support online perception and have no known mapping to actual neural circuits. Here we present a neurally plausible model for efficiently inverting generative models of images and test it as a precise account of one aspect of high-level vision, the perception of faces. The model is based on a deep neural network that learns to invert a three-dimensional (3D) face graphics program in a single fast feedforward pass. It successfully explains both human behavioral data and multiple levels of neural processing in non-human primates, as well as a classic illusion, the "hollow face" effect. The model also fits qualitatively better than state-of-the-art computer vision models, and suggests an interpretable reverse-engineering account of how images are transformed into scene percepts in the primate ventral stream.

}, url = {https://www.biorxiv.org/content/early/2018/04/02/282798}, author = {Ilker Yildirim and W. A. Freiwald and Tenenbaum J.} } @article {4181, title = {End-to-end differentiable physics for learning and control}, year = {2018}, author = {Filipe de Avila Belbute-Peres and Kevin A Smith and Kelsey Allen and Joshua B. Tenenbaum and Zico Kolter} } @article {3452, title = {A fast, invariant representation for human action in the visual system}, journal = {Journal of Neurophysiology}, year = {2018}, abstract = {

Humans can effortlessly recognize others{\textquoteright} actions in the presence of complex transformations, such as changes in viewpoint. Several studies have located the regions in the brain involved in invariant action recognition; however, the underlying neural computations remain poorly understood. We use magnetoencephalography decoding and a data set of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by different actors at different viewpoints to study the computational steps used to recognize actions across complex transformations. In particular, we ask when the brain discriminates between different actions, and when it does so in a manner that is invariant to changes in 3D viewpoint. We measure the latency difference between invariant and noninvariant action decoding when subjects view full videos as well as form-depleted and motion-depleted stimuli. We were unable to detect a difference in decoding latency or temporal profile between invariant and noninvariant action recognition in full videos. However, when either form or motion information is removed from the stimulus set, we observe a decrease and delay in invariant action decoding. Our results suggest that the brain recognizes actions and builds invariance to complex transformations at the same time and that both form and motion information are crucial for fast, invariant action recognition.

Associated Dataset: MEG action recognition data

}, doi = {https://doi.org/10.1152/jn.00642.2017}, url = {https://www.physiology.org/doi/10.1152/jn.00642.2017}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {3827, title = {Finding any Waldo with zero-shot invariant and efficient visual search}, journal = {Nature Communications}, volume = {9}, year = {2018}, month = {09/2018}, abstract = {

Searching for a target object in a cluttered scene constitutes a fundamental challenge in daily vision. Visual search must be selective enough to discriminate the target from distractors, invariant to changes in the appearance of the target, efficient to avoid exhaustive exploration of the image, and must generalize to locate novel target objects with zero-shot training. Previous work on visual search has focused on searching for perfect matches of a target after extensive category-specific training. Here, we show for the first time that humans can efficiently and invariantly search for natural objects in complex scenes. To gain insight into the mechanisms that guide visual search, we propose a biologically inspired computational model that can locate targets without exhaustive sampling and which can generalize to novel objects. The model provides an approximation to the mechanisms integrating bottom-up and top-down signals during search in natural scenes.

}, doi = {10.1038/s41467-018-06217-x}, url = {http://www.nature.com/articles/s41467-018-06217-x}, author = {Zhang, Mengmi and Feng, Jiashi and Ma, Keng Teck and Lim, Joo Hwee and Qi Zhao and Gabriel Kreiman} } @article {3589, title = {From Map Reading to Geometric Intuitions}, journal = {Developmental Psychology}, year = {2018}, month = {03/2018}, abstract = {

The origins and development of our geometric intuitions have been debated for millennia. The present study links children{\textquoteright}s developing intuitions about the properties of planar triangles to their developing abilities to read purely geometric maps. Six-year-old children are limited when navigating by maps that depict only the sides of a triangle in an environment composed of only the triangle{\textquoteright}s corners and vice versa. Six-year-old children also incorrectly judge how the angle size of the third corner of a triangle varies with changes to the other two corners. These limitations in map reading and in judgments about triangles are attenuated, respectively, by 10 and 12 years of age. Moreover, as children get older, their map reading predicts their geometric judgments on the triangle task. Map reading thus undergoes developmental changes that parallel an emerging capacity to reason explicitly about the distance and angle relations essential to Euclidean geometry.

Supplemental materials: http://dx.doi.org/10.1037/dev0000509.supp
}, keywords = {euclidean geometry, mathematical cognition, spatial cognition, spatial symbols}, issn = {0012-1649}, doi = {http://dx.doi.org/10.1037/dev0000509}, url = {http://psycnet.apa.org/record/2018-12810-001}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {2548, title = {Full interpretation of minimal images.}, journal = {Cognition}, volume = {171}, year = {2018}, month = {02/2018}, pages = {65-84}, chapter = {65}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object to the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss implications of full interpretation to difficult visual tasks, such as recognizing human activities or interactions, which are beyond the scope of current models of visual recognition.

}, keywords = {Image interpretation, Minimal images, Parts and relations, Top-down processing}, doi = {https://doi.org/10.1016/j.cognition.2017.10.006}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @article {4107, title = {Full interpretation of minimal images}, journal = {Cognition}, volume = {171}, year = {2018}, month = {01/2018}, pages = {65 - 84}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object to the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss possible extensions and implications of full interpretation to difficult visual tasks, such as recognizing social interactions, which are beyond the scope of current models of visual recognition.

}, keywords = {Image interpretation, Minimal images, Parts and relations, Top-down processing}, issn = {00100277}, doi = {10.1016/j.cognition.2017.10.006}, url = {https://linkinghub.elsevier.com/retrieve/pii/S001002771730269X}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @conference {4109, title = {Grounding language acquisition by training semantic parsers using captioned videos}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP 2018)}, year = {2018}, month = {10/2018}, address = {Brussels, Belgium}, abstract = {

We develop a semantic parser that is trained in a grounded setting using pairs of videos captioned with sentences. This setting is both data-efficient, requiring little annotation, and similar to the experience of children where they observe their environment and listen to speakers. The semantic parser recovers the meaning of English sentences despite not having access to any annotated sentences. It does so despite the ambiguity inherent in vision where a sentence may refer to any combination of objects, object properties, relations or actions taken by any agent in a video. For this task, we collected a new dataset for grounded language acquisition. Learning a grounded semantic parser {\textemdash} turning sentences into logical forms using captioned videos {\textemdash} can significantly expand the range of data that parsers can be trained on, lower the effort of training a semantic parser, and ultimately lead to a better understanding of child language acquisition.

}, isbn = {978-1-948087-84-1}, url = {http://aclweb.org/anthology/D18-1285}, author = {Candace Ross and Andrei Barbu and Yevgeni Berzak and Battushig Myanganbayar and Boris Katz} } @article {3576, title = {Human inference of force from impact sounds: Perceptual evidence for inverse physics}, volume = {143}, year = {2018}, month = {03/2018}, abstract = {

An impact sound is determined both by material properties of the objects involved (e.g., mass, density, shape, and rigidity) and by the force of the collision. Human listeners can typically estimate the force of an impact as well as the material which has been struck. To investigate the underlying auditory mechanisms we played listeners audio recordings of two boards being struck and measured their ability to identify the board struck with more force. Listeners significantly outperformed models based on simple acoustic features (e.g., signal power or spectral centroid). We repeated the experiment with synthetic sounds generated from simulated object resonant modes and simulated contact forces derived from a spring model. Listeners could not distinguish synthetic from real recordings and successfully estimated simulated impact force. When the synthetic modes were altered (e.g., to simulate a harder material) listeners altered their judgments of both material and impact force, consistent with the physical implications of the alteration. The results suggest that humans use resonant modes to infer object material, and use this knowledge to estimate the impact force, explaining away material contributions to the sound.
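A small sketch of the modal-synthesis idea behind the synthetic stimuli: an impact sound modeled as a sum of exponentially decaying sinusoids (the struck object's resonant modes) whose overall amplitude scales with the contact force. The mode frequencies and decay times below are invented for illustration, not the spring-model parameters used in the study.

import numpy as np

fs, dur = 44100, 0.5
t = np.arange(int(fs * dur)) / fs
modes = [(420.0, 0.30), (910.0, 0.18), (1760.0, 0.08)]   # (frequency in Hz, decay time in s)

def impact_sound(force, modes=modes):
    # Sum of damped sinusoids; the impact force scales the overall amplitude.
    sound = sum(np.exp(-t / tau) * np.sin(2 * np.pi * f * t) for f, tau in modes)
    return force * sound / len(modes)

soft, hard = impact_sound(0.3), impact_sound(1.0)
print("peak amplitudes:", soft.max().round(3), hard.max().round(3))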

}, doi = {10.1121/1.5035721}, url = {https://asa.scitation.org/doi/abs/10.1121/1.5035721}, author = {James Traer and Josh H. McDermott} } @article {3577, title = {Human recognition of environmental sounds is not always robust to reverberation}, volume = {143}, year = {2018}, edition = {The Journal of the Acoustical Society of America }, abstract = {

Reverberation is ubiquitous in natural environments, but its effect on the recognition of non-speech sounds is poorly documented. To evaluate human robustness to reverberation, we measured its effect on the recognizability of everyday sounds. Listeners identified a diverse set of recorded environmental sounds (footsteps, animal vocalizations, vehicles moving, hammering, etc.) in an open set recognition task. For each participant, half of the sounds (randomly assigned) were presented in reverberation. We found the effect of reverberation to depend on the typical listening conditions for a sound. Sounds that are typically loud and heard in indoor environments, and which thus should often be accompanied by reverberation, were recognized robustly, with only a small impairment for reverberant conditions. In contrast, sounds that are either typically quiet or typically heard outdoors, for which reverberation should be less pronounced, produced a large recognition decrement in reverberation. These results demonstrate that humans can be remarkably robust to the distortion induced by reverberation, but that this robustness disappears when the reverberation is not consistent with the expected source properties. The results are consistent with the idea that listeners perceptually separate sound sources from reverberation, constrained by the likelihood of source-environment pairings.

}, doi = {10.1121/1.5035960}, url = {https://asa.scitation.org/doi/abs/10.1121/1.5035960}, author = {James Traer and Josh H. McDermott} } @article {4106, title = {Image interpretation above and below the object level}, journal = {Interface Focus}, volume = {8}, year = {2018}, month = {06/2018}, pages = {20180020}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivalling in some areas human-level performance. Much of the progress to date has focused on analysing the visual scene at the object level{\textemdash}the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object level, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, and their components, properties and inter-relations, a process referred here as {\textquoteleft}image interpretation{\textquoteright}. In this paper, we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, and in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteleft}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels stages of visual analysis.

}, issn = {2042-8898}, doi = {10.1098/rsfs.2018.0020}, url = {https://royalsocietypublishing.org/doi/full/10.1098/rsfs.2018.0020$\#$d3e1503}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {3627, title = {Image interpretation above and below the object level}, year = {2018}, month = {05/2018}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivaling in some areas human-level performance. Much of the progress to date has focused on analyzing the visual scene at the object level {\textendash} the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object levels, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, their components, properties, and inter-relations, a process referred here as {\textquoteleft}image interpretation{\textquoteright}.

In this paper we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteright}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels stages of visual analysis.

}, keywords = {Interaction Recognition, minimal images, Social Interactions, Visual interpretation, visual recognition}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {3620, title = {Image interpretation above and below the object level}, journal = {Proceedings of the Royal Society: Interface Focus}, year = {2018}, month = {06/2018}, abstract = {

Computational models of vision have advanced in recent years at a rapid rate, rivaling in some areas human-level performance. Much of the progress to date has focused on analyzing the visual scene at the object level {\textendash} the recognition and localization of objects in the scene. Human understanding of images reaches a richer and deeper image understanding both {\textquoteleft}below{\textquoteright} the object level, such as identifying and localizing object parts and sub-parts, as well as {\textquoteleft}above{\textquoteright} the object levels, such as identifying object relations, and agents with their actions and interactions. In both cases, understanding depends on recovering meaningful structures in the image, their components, properties, and inter-relations, a process referred here as {\textquoteleft}image interpretation{\textquoteright}.

In this paper we describe recent directions, based on human and computer vision studies, towards human-like image interpretation, beyond the reach of current schemes, both below the object level, as well as some aspects of image interpretation at the level of meaningful configurations beyond the recognition of individual objects, in particular, interactions between two people in close contact. In both cases the recognition process depends on the detailed interpretation of so-called {\textquoteright}minimal images{\textquoteright}, and at both levels recognition depends on combining {\textquoteleft}bottom-up{\textquoteright} processing, proceeding from low to higher levels of a processing hierarchy, together with {\textquoteleft}top-down{\textquoteright} processing, proceeding from high to lower levels stages of visual analysis.

}, author = {Guy Ben-Yosef and Shimon Ullman} } @article {3584, title = {Imaging the infant brain}, volume = {Kobe Japan}, year = {2018}, month = {07/2018}, url = {http://www.neuroscience2018.jnss.org/en/}, author = {Rebecca Saxe} } @article {3871, title = {Invariant Recognition Shapes Neural Representations of Visual Input}, journal = {Annual Review of Vision Science}, volume = {4}, year = {2018}, month = {10/2018}, pages = {403 - 422}, abstract = {

Recognizing the people, objects, and actions in the world around us is a crucial aspect of human perception that allows us to plan and act in our environment. Remarkably, our proficiency in recognizing semantic categories from visual input is unhindered by transformations that substantially alter their appearance (e.g., changes in lighting or position). The ability to generalize across these complex transformations is a hallmark of human visual intelligence, which has been the focus of wide-ranging investigation in systems and computational neuroscience. However, while the neural machinery of human visual perception has been thoroughly described, the computational principles dictating its functioning remain unknown. Here, we review recent results in brain imaging, neurophysiology, and computational neuroscience in support of the hypothesis that the ability to support the invariant recognition of semantic entities in the visual world shapes which neural representations of sensory input are computed by human visual cortex.

}, keywords = {computational neuroscience, Invariance, neural decoding, visual representations}, issn = {2374-4642}, doi = {10.1146/annurev-vision-091517-034103}, url = {https://www.annualreviews.org/doi/10.1146/annurev-vision-091517-034103}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @conference {3893, title = {The Language of Fake News: Opening the Black-Box of Deep Learning Based Detectors}, booktitle = {workshop on "AI for Social Good", NIPS 2018}, year = {2018}, month = {11/2018}, address = {Montreal, Canada}, abstract = {

The digital information age has generated new outlets for content creators to publish so-called {\textquotedblleft}fake news{\textquotedblright}, a new form of propaganda that is intentionally designed to mislead the reader. With the widespread effects of the fast dissemination of fake news, efforts have been made to automate the process of fake news detection. A promising solution that has come up recently is to use machine learning to detect patterns in the news sources and articles, specifically deep neural networks, which have been successful in natural language processing. However, deep networks come with lack of transparency in the decision-making process, i.e. the {\textquotedblleft}black-box problem{\textquotedblright}, which obscures its reliability. In this paper, we open this {\textquotedblleft}black-box{\textquotedblright} and we show that the emergent representations from deep neural networks capture subtle but consistent differences in the language of fake and real news: signatures of exaggeration and other forms of rhetoric. Unlike previous work, we test the transferability of the learning process to novel news topics. Our results demonstrate the generalization capabilities of deep learning to detect fake news in novel subjects only from language patterns.

}, url = {http://hdl.handle.net/1721.1/120056}, author = {Nicole O{\textquoteright}Brien and Sophia Latessa and Georgios Evangelopoulos and Xavier Boix} } @article {3562, title = {Learning Mid-Level Auditory Codes from Natural Sound Statistics}, journal = {Neural Computation}, volume = {30}, year = {2018}, month = {03/2018}, pages = {631-669}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {3513, title = {Learning physical parameters from dynamic scenes.}, journal = {Cognitive Psychology}, volume = {104}, year = {2018}, month = {8/2018}, pages = {57-82}, abstract = {

Humans acquire their most basic physical concepts early in development, and continue to enrich and expand their intuitive physics throughout life as they are exposed to more and varied dynamical environments. We introduce a hierarchical Bayesian framework to explain how people can learn physical parameters at multiple levels. In contrast to previous Bayesian models of theory acquisition (Tenenbaum et al., 2011), we work with more expressive probabilistic program representations suitable for learning the forces and properties that govern how objects interact in dynamic scenes unfolding over time. We compare our model to human learners on a challenging task of estimating multiple physical parameters in novel microworlds given short movies. This task requires people to reason simultaneously about multiple interacting physical laws and properties. People are generally able to learn in this setting and are consistent in their judgments. Yet they also make systematic errors indicative of the approximations people might make in solving this computationally demanding problem with limited computational resources. We propose two approximations that complement the top-down Bayesian approach. One approximation model relies on a more bottom-up feature-based inference scheme. The second approximation combines the strengths of the bottom-up and top-down approaches, by taking the feature-based inference as its point of departure for a search in physical-parameter space.
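A toy illustration of the kind of parameter inference the hierarchical Bayesian framework performs, reduced to a single physical parameter (a friction-like deceleration) and a grid approximation of the posterior; the generative model, noise level, and flat prior are assumptions made for this example, not the paper's model.

import numpy as np

rng = np.random.default_rng(0)
true_friction, v0, dt, sigma = 0.4, 2.0, 0.1, 0.05

def simulate(friction, n=15):
    # Simple generative model: a sliding object decelerating at a constant rate.
    return np.maximum(v0 - friction * dt * np.arange(n), 0.0)

observed = simulate(true_friction) + sigma * rng.normal(size=15)   # noisy observed speeds

grid = np.linspace(0.0, 1.0, 201)                                  # candidate friction values
log_post = np.array([-np.sum((observed - simulate(f)) ** 2) / (2 * sigma ** 2)
                     for f in grid])                               # Gaussian likelihood, flat prior
post = np.exp(log_post - log_post.max())
post /= post.sum()
print("posterior mean friction:", float(np.sum(grid * post)))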

}, keywords = {intuitive physics, intuitive theory, learning, physical reasoning, probabilistic inference}, doi = {10.1016/j.cogpsych.2017.05.006}, url = {https://www-sciencedirect-com.libproxy.mit.edu/science/article/pii/S0010028517301822}, author = {Ullman, Tomer D. and Stuhlm{\"u}ller, Andreas and Noah D. Goodman and Joshua B. Tenenbaum} } @proceedings {3521, title = {Learning scene gist with convolutional neural networks to improve object recognition}, year = {2018}, month = {05/2018 }, address = {Princeton, NJ}, doi = {10.1109/CISS.2018.8362305}, url = {https://ieeexplore.ieee.org/abstract/document/8362305}, author = {Wu Eric and Wu Kevin and Gabriel Kreiman} } @article {4102, title = {Learning Scene Gist with Convolutional Neural Networks to Improve Object Recognition}, journal = { arXiv | Cornell University}, volume = { arXiv:1803.01967}, year = {2018}, month = {03/2018}, abstract = {Advancements in convolutional neural networks (CNNs) have made significant strides toward achieving high performance levels on multiple object recognition tasks. While some approaches utilize information from the entire scene to propose regions of interest, the task of interpreting a particular region or object is still performed independently of other objects and features in the image. Here we demonstrate that a scene{\textquoteright}s {\textquoteright}gist{\textquoteright} can significantly contribute to how well humans can recognize objects. These findings are consistent with the notion that humans foveate on an object and incorporate information from the periphery to aid in recognition. We use a biologically inspired two-part convolutional neural network ({\textquoteright}GistNet{\textquoteright}) that models the fovea and periphery to provide a proof-of-principle demonstration that computational object recognition can significantly benefit from the gist of the scene as contextual information. Our model yields accuracy improvements of up to 50\% in certain object categories when incorporating contextual gist, while only increasing the original model size by 5\%. This proposed model mirrors our intuition about how the human visual system recognizes objects, suggesting specific biologically plausible constraints to improve machine vision and building initial steps towards the challenge of scene understanding. }, url = {http://arxiv.org/abs/1803.01967}, author = {Kevin Wu and Eric Wu and Gabriel Kreiman} } @article {3514, title = {Lucky or clever? From changed expectations to attributions of responsibility}, journal = {Cognition}, year = {2018}, month = {08/2018}, author = {Tobias Gerstenberg and Ullman, Tomer D. and Nagel, Jonas and Max Kleiman-Weiner and D. A. Lagnado and Joshua B. Tenenbaum} } @article {3685, title = {MEG action recognition data}, year = {2018}, abstract = {

MEG action recognition data from Isik et al., 2018 and Tacchetti et al., 2017. In binned format to be used with the Neural Decoding Toolbox (2018-02-13).

Associated publications:

L. Isik,\ Tacchetti, A., and\ Poggio, T.,\ {\textquotedblleft}A fast, invariant representation for human action in the visual system{\textquotedblright},\ Journal of Neurophysiology, 2018.
A. Tacchetti,\ Isik, L., and\ Poggio, T.,\ {\textquotedblleft}Invariant recognition drives neural representations of action sequences{\textquotedblright},\ PLoS Comp. Bio, 2017.
}, doi = {https://doi.org/10.7910/DVN/KFYY2M}, url = {https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/KFYY2M}, author = {Leyla Isik and Andrea Tacchetti} } @article {4200, title = {Mental labour}, journal = {Nature Human Behaviour}, volume = {2}, year = {2018}, month = {12/2018}, pages = {899 - 908}, abstract = {

Mental effort is an elementary notion in our folk psychology and a familiar fixture in everyday introspective experience. However, as an object of scientific study, mental effort has remained rather elusive. Cognitive psychology has provided some tools for understanding how effort impacts performance, by linking effort with cognitive control function. What has remained less clear are the principles that govern the allocation of mental effort. Under what circumstances do people choose to invest mental effort, and when do they decline to do so? And what regulates the intensity of mental effort when it is applied? In new and promising work, these questions are being approached with the tools of behavioural economics. Though still in its infancy, this economic approach to mental effort research has already uncovered important aspects of effort-based decision-making, and points clearly to future lines of inquiry, including some intriguing opportunities presented by recent artificial intelligence research.

}, doi = {10.1038/s41562-018-0401-9}, url = {http://www.nature.com/articles/s41562-018-0401-9}, author = {Kool, Wouter and Botvinick, Matthew} } @article {4100, title = {Minimal memory for details in real life events}, journal = {Scientific Reports}, volume = {8}, year = {2018}, month = {Jan-12-2018}, abstract = {

The extent to which the details of past experiences are retained or forgotten remains controversial. Some studies suggest massive storage while others describe memories as fallible summary recreations of original events. The discrepancy can be ascribed to the content of memories and how memories are evaluated. Many studies have focused on recalling lists of words/pictures, which lack the critical ingredients of real world memories. Here we quantified the ability to remember details about one hour of real life. We recorded video and eye movements while subjects walked along specified routes and evaluated whether they could distinguish video clips from their own experience from foils. Subjects were minimally above chance in remembering the minutiae of their experiences. Recognition of specific events could be partly explained by a machine-learning model of video contents. These results quantify recognition memory for events in real life and show that the details of everyday experience are largely not retained in memory.

}, doi = {10.1038/s41598-018-33792-2}, url = {https://www.nature.com/articles/s41598-018-33792-2}, author = {Pranav Misra and Marconi, Alyssa and M.F. Peterson and Gabriel Kreiman} } @article {4197, title = {A Minimal Turing Test}, journal = {Journal of Experimental Social Psychology}, volume = {79}, year = {2018}, month = {11/2018}, pages = {1 - 8}, abstract = {

We introduce the Minimal Turing Test, an experimental paradigm for studying perceptions and meta-perceptions of different social groups or kinds of agents, in which participants must use a single word to convince a judge of their identity. We illustrate the paradigm by having participants act as contestants or judges in a Minimal Turing Test in which contestants must convince a judge they are a human, rather than an artificial intelligence. We embed the production data from such a large-scale Minimal Turing Test in a semantic vector space, and construct an ordering over pairwise evaluations from judges. This allows us to identify the semantic structure in the words that people give, and to obtain quantitative measures of the importance that people place on different attributes. Ratings from independent coders of the production data provide additional evidence for the agency and experience dimensions discovered in previous work on mind perception. We use the theory of Rational Speech Acts as a framework for interpreting the behavior of contestants and judges in the Minimal Turing Test.

}, keywords = {Meta-stereotypes, Mind perception, Natural language processing, Stereotypes, Turing Test}, issn = {00221031}, doi = {10.1016/j.jesp.2018.05.007}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0022103117303980}, author = {John P. McCoy and Ullman, Tomer D.} } @inbook {3489, title = {A Natural Language Interface for Mobile Devices}, booktitle = {The Wiley Handbook of Human Computer Interaction}, volume = {2}, year = {2018}, month = {02/2018 }, pages = {539-559}, publisher = {John Wiley \& Sons, }, organization = {John Wiley \& Sons, }, edition = {First}, abstract = {

This chapter discusses some of the primary issues related to the design and construction of natural language interfaces, and in particular, interfaces to mobile devices. It describes two systems in this space: the START information access system and the StartMobile natural language interface to mobile devices. The chapter also discusses recently deployed commercial systems and future directions. The use of natural language annotations, and in particular, parameterized natural language annotations, enables START to respond to user requests in a wide variety of ways. StartMobile uses the START system as a first stage in the processing of user requests. Current commercial systems such as Apple{\textquoteright}s Siri, IBM{\textquoteright}s Watson, Google{\textquoteright}s {\textquotedblleft}Google Now{\textquotedblright}, Microsoft{\textquoteright}s Cortana, and Amazon{\textquoteright}s Alexa employ technology of the sort contained in START and StartMobile in combination with statistical ...

}, doi = {10.1002/9781118976005.ch23}, author = {Boris Katz and Gary Borchardt and Sue Felshin and Federico Mora} } @article {4099, title = {Neural Interactions Underlying Visuomotor Associations in the Human Brain}, journal = {Cerebral Cortex}, volume = {1{\textendash}17}, year = {2018}, month = {12/2018}, abstract = {

Rapid and flexible learning during behavioral choices is critical to our daily endeavors and constitutes a hallmark of dynamic reasoning. An important paradigm to examine flexible behavior involves learning new arbitrary associations mapping visual inputs to motor outputs. We conjectured that visuomotor rules are instantiated by translating visual signals into actions through dynamic interactions between visual, frontal and motor cortex. We evaluated the neural representation of such visuomotor rules by performing intracranial field potential recordings in epilepsy subjects during a rule-learning delayed match-to-behavior task. Learning new visuomotor mappings led to the emergence of specific responses associating visual signals with motor outputs in 3 anatomical clusters in frontal, anteroventral temporal and posterior parietal cortex. After learning, mapping selective signals during the delay period showed interactions with visual and motor signals. These observations provide initial steps towards elucidating the dynamic circuits underlying flexible behavior and how communication between subregions of frontal, temporal, and parietal cortex leads to rapid learning of task-relevant choices.

}, keywords = {frontal cortex, human neurophysiology, reinforcement learning, visual cortex}, issn = {1047-3211}, doi = {10.1093/cercor/bhy333}, url = {http://klab.tch.harvard.edu/publications/PDFs/gk7766.pdf}, author = {Radhika Madhavan and Bansal, Arjun K and Joseph Madsen and Golby, Alexandra J and Travis S Tierney and Emad Eskandar and WS Anderson and Gabriel Kreiman} } @article {4101, title = {A neural network trained to predict future videoframes mimics critical properties of biologicalneuronal responses and perception}, number = {arXiv:1805.10734v2}, year = {2018}, month = {05/2018}, institution = { arXiv | Cornell University}, abstract = {

While deep neural networks take loose inspiration from neuroscience, it is an open question how seriously to take the analogies between artificial deep networks and biological neuronal systems. Interestingly, recent work has shown that deep convolutional neural networks (CNNs) trained on large-scale image recognition tasks can serve as strikingly good models for predicting the responses of neurons in visual cortex to visual stimuli, suggesting that analogies between artificial and biological neural networks may be more than superficial. However, while CNNs capture key properties of the average responses of cortical neurons, they fail to explain other properties of these neurons. For one, CNNs typically require large quantities of labeled input data for training. Our own brains, in contrast, rarely have access to this kind of supervision, so to the extent that representations are similar between CNNs and brains, this similarity must arise via different training paths. In addition, neurons in visual cortex produce complex time-varying responses even to static inputs, and they dynamically tune themselves to temporal regularities in the visual environment. We argue that these differences are clues to fundamental differences between the computations performed in the brain and in deep networks. To begin to close the gap, here we study the emergent properties of a previously-described recurrent generative network that is trained to predict future video frames in a self-supervised manner. Remarkably, the model is able to capture a wide variety of seemingly disparate phenomena observed in visual cortex, ranging from single unit response dynamics to complex perceptual motion illusions. These results suggest potentially deep connections between recurrent predictive neural network models and the brain, providing new leads that can enrich both fields.
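
To make the self-supervised objective concrete, here is a toy sketch (not the recurrent generative network studied in the paper): a linear predictor trained by gradient descent to predict the next frame of a synthetic drifting-bar sequence. The stimulus, model, and learning rate are invented stand-ins for the next-frame-prediction setup described above.

import numpy as np

# Self-supervised objective from the abstract: predict the next video frame.
# Toy stand-in: a linear predictor trained on a drifting-bar sequence.
rng = np.random.default_rng(1)
T, D = 200, 16
frames = np.zeros((T, D))
for t in range(T):
    frames[t, t % D] = 1.0           # a bright pixel drifting across the frame

W = rng.normal(scale=0.01, size=(D, D))
lr = 0.1
for epoch in range(2000):
    pred = frames[:-1] @ W           # predict frame t+1 from frame t
    err = pred - frames[1:]
    W -= lr * frames[:-1].T @ err / (T - 1)   # gradient step on the MSE loss

mse = np.mean((frames[:-1] @ W - frames[1:]) ** 2)
print("next-frame prediction MSE:", round(float(mse), 6))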

}, url = {https://arxiv.org/pdf/1805.10734.pdf}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {3963, title = {Partially Occluded Hands: A challenging new dataset for single-image hand pose estimation}, year = {2018}, month = {12/2018}, abstract = {

Recognizing the pose of hands matters most when hands are interacting with other objects. To understand how well both machines and humans perform on single-image 2D hand-pose reconstruction from RGB images, we collected a challenging dataset of hands interacting with 148 objects. We used a novel methodology that provides the same hand in the same pose both with the object being present and occluding the hand and without the object occluding the hand. Additionally, we collected a wide range of grasps for each object, designing the data collection methodology to ensure this diversity. Using this dataset we measured the performance of two state-of-the-art hand-pose recognition methods showing that both are extremely brittle when faced with even light occlusion from an object. This is not evident in previous datasets because they often avoid hand-object occlusions and because they are collected from videos where hands are often between objects and mostly unoccluded. We annotated a subset of the dataset and used that to show that humans are robust with respect to occlusion, and also to characterize human hand perception, the space of grasps that seem to be considered, and the accuracy of reconstructing occluded portions of hands. We expect that such data will be of interest to both the vision community for developing more robust hand-pose algorithms and to the robotic grasp planning community for learning such grasps. The dataset is available at occludedhands.com

}, keywords = {dataset, Partial occlusion, RGB hand-pose reconstruction}, author = {Battushig Myanganbayar and Cristina Mata and Gil Dekel and Katz, Boris and Guy Ben-Yosef and Andrei Barbu} } @conference {3964, title = {Partially Occluded Hands: A challenging new dataset for single-image hand pose estimation}, booktitle = {The 14th Asian Conference on Computer Vision (ACCV 2018)}, year = {2018}, month = {12/2018}, abstract = {

Recognizing the pose of hands matters most when hands are interacting with other objects. To understand how well both machines and humans perform on single-image 2D hand-pose reconstruction from RGB images, we collected a challenging dataset of hands interacting with 148 objects. We used a novel methodology that provides the same hand in the same pose both with the object being present and occluding the hand and without the object occluding the hand. Additionally, we collected a wide range of grasps for each object, designing the data collection methodology to ensure this diversity. Using this dataset we measured the performance of two state-of-the-art hand-pose recognition methods showing that both are extremely brittle when faced with even light occlusion from an object. This is not evident in previous datasets because they often avoid hand-object occlusions and because they are collected from videos where hands are often between objects and mostly unoccluded. We annotated a subset of the dataset and used that to show that humans are robust with respect to occlusion, and also to characterize human hand perception, the space of grasps that seem to be considered, and the accuracy of reconstructing occluded portions of hands. We expect that such data will be of interest to both the vision community for developing more robust hand-pose algorithms and to the robotic grasp planning community for learning such grasps. The dataset is available at occludedhands.com

}, keywords = {dataset, Partial occlusion, RGB hand-pose reconstruction}, url = {http://accv2018.net/}, author = {Battushig Myanganbayar and Cristina Mata and Gil Dekel and Boris Katz and Guy Ben-Yosef and Andrei Barbu} } @article {4190, title = {Planning Complexity Registers as a Cost in Metacontrol}, journal = {Journal of Cognitive Neuroscience}, volume = {30}, year = {2018}, month = {10/2018}, pages = {1391 - 1404}, abstract = {

Decision-making algorithms face a basic tradeoff between accuracy and effort (i.e., computational demands). It is widely agreed that humans can choose between multiple decision-making processes that embody different solutions to this tradeoff: Some are computationally cheap but inaccurate, whereas others are computationally expensive but accurate. Recent progress in understanding this tradeoff has been catalyzed by formalizing it in terms of model-free (i.e., habitual) versus model-based (i.e., planning) approaches to reinforcement learning. Intuitively, if two tasks offer the same rewards for accuracy but one of them is much more demanding, we might expect people to rely on habit more in the difficult task: Devoting significant computation to achieve slight marginal accuracy gains would not be "worth it." We test and verify this prediction in a sequential reinforcement learning task. Because our paradigm is amenable to formal analysis, it contributes to the development of a computational model of how people balance the costs and benefits of different decision-making processes in a task-specific manner; in other words, how we decide when hard thinking is worth it.

}, issn = {0898-929X}, doi = {10.1162/jocn_a_01263}, url = {https://www.mitpressjournals.org/doi/abs/10.1162/jocn_a_01263}, author = {Kool, Wouter and Samuel J Gershman and Fiery A Cushman} } @article {3622, title = {Rational inference of beliefs and desires from emotional expressions}, journal = {Cognitive Science}, volume = {42}, year = {2018}, month = {04/2018}, chapter = {850-884}, abstract = {

We investigated people{\textquoteright}s ability to infer others{\textquoteright} mental states from their emotional reactions, manipulating whether agents wanted, expected, and caused an outcome. Participants recovered agents{\textquoteright} desires throughout. When the agent observed, but did not cause the outcome, participants{\textquoteright} ability to recover the agent{\textquoteright}s beliefs depended on the evidence they got (i.e., her reaction only to the actual outcome or to both the expected and actual outcomes; Experiments 1 and 2). When the agent caused the event, participants{\textquoteright} judgments also depended on the probability of the action (Experiments 3 and 4); when actions were improbable given the mental states, people failed to recover the agent{\textquoteright}s beliefs even when they saw her react to both the anticipated and actual outcomes. A Bayesian model captured human performance throughout (rs >= .95), consistent with the proposal that people rationally integrate information about others{\textquoteright} actions and emotional reactions to infer their unobservable mental states.

}, author = {Wu, Yang and Chris Baker and Joshua B. Tenenbaum and Laura Schulz} } @article {3990, title = {Real-Time Readout of Large-Scale Unsorted Neural Ensemble Place Codes}, journal = {Cell Reports}, volume = {25}, year = {2018}, month = {Jan-12-2018}, pages = {2635 - 2642.e5}, abstract = {

Uncovering spatial representations from large-scale ensemble spike activity in specific brain circuits provides valuable feedback in closed-loop experiments. We develop a graphics processing unit (GPU)-powered population-decoding system for ultrafast reconstruction of spatial positions from rodents{\textquoteright} unsorted spatiotemporal spiking patterns, during run behavior or sleep. In comparison with an optimized quad-core central processing unit (CPU) implementation, our approach achieves an \~{}20- to 50-fold increase in speed in eight tested rat hippocampal, cortical, and thalamic ensemble recordings, with real-time decoding speed (approximately fraction of a millisecond per spike) and scalability up to thousands of channels. By accommodating parallel shuffling in real time (computation time \<15 ms), our approach enables assessment of the statistical significance of online-decoded {\textquotedblleft}memory replay{\textquotedblright} candidates during quiet wakefulness or sleep. This open-source software toolkit supports the decoding of spatial correlates or content-triggered experimental manipulation in closed-loop neuroscience experiments.
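
A greatly reduced sketch of the kind of spike-to-position reconstruction described above: Poisson decoding of position from one time bin of population spike counts. The Gaussian place fields, bin size, and cell count are invented, and this ignores the unsorted-spike handling and GPU acceleration that are the point of the toolkit.

import numpy as np

# Toy Bayesian place decoding: reconstruct position from population spike counts.
rng = np.random.default_rng(2)
n_cells, n_pos = 30, 100
positions = np.linspace(0.0, 1.0, n_pos)
centers = rng.uniform(0.0, 1.0, n_cells)
# Gaussian place fields: expected spike counts in a 200 ms bin, peak rate 20 Hz.
rates = 0.2 * 20.0 * np.exp(-(positions[None, :] - centers[:, None]) ** 2 / (2 * 0.05 ** 2))

true_idx = 40
counts = rng.poisson(rates[:, true_idx])        # one time bin of spike counts

# Poisson log-likelihood over positions (flat prior).
log_post = counts @ np.log(rates + 1e-12) - rates.sum(axis=0)
decoded = positions[np.argmax(log_post)]
print("true:", round(float(positions[true_idx]), 3), "decoded:", round(float(decoded), 3))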

}, keywords = {GPU, memory replay, neural decoding, place codes, population decoding, spatiotemporal patterns}, issn = {22111247}, doi = {https://doi.org/10.1016/j.celrep.2018.11.033}, url = {https://www.sciencedirect.com/science/article/pii/S2211124718317960}, author = {Hu, Sile and Ciliberti, Davide and Grosmark, Andres D. and Michon, Fr{\'e}d{\'e}ric and Ji, Daoyun and Hector Penagos and Buzs{\'a}ki, Gy{\"o}rgy and Matthew A. Wilson and Kloosterman, Fabian and Chen, Zhe} } @article {3764, title = {Recurrent computations for visual pattern completion}, journal = {Proceedings of the National Academy of Sciences}, year = {2018}, month = {08/2018}, abstract = {

Making inferences from partial information constitutes a critical aspect of cognition. During visual perception, pattern completion enables recognition of poorly visible or occluded objects. We combined psychophysics, physiology, and computational models to test the hypothesis that pattern completion is implemented by recurrent computations and present three pieces of evidence that are consistent with this hypothesis. First, subjects robustly recognized objects even when they were rendered \<15\% visible, but recognition was largely impaired when processing was interrupted by backward masking. Second, invasive physiological responses along the human ventral cortex exhibited visually selective responses to partially visible objects that were delayed compared with whole objects, suggesting the need for additional computations. These physiological delays were correlated with the effects of backward masking. Third, state-of-the-art feed-forward computational architectures were not robust to partial visibility. However, recognition performance was recovered when the model was augmented with attractor-based recurrent connectivity. The recurrent model was able to predict which images of heavily occluded objects were easier or harder for humans to recognize, could capture the effect of introducing a backward mask on recognition behavior, and was consistent with the physiological delays along the human ventral visual stream. These results provide a strong argument of plausibility for the role of recurrent computations in making visual inferences from partial information.
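
As a minimal illustration of "attractor-based recurrent connectivity" completing a partially visible input (a simple Hopfield-style stand-in, not the augmented model used in the paper), the sketch below stores a few binary patterns and lets recurrent updates fill in a heavily occluded cue; the patterns and occlusion level are synthetic.

import numpy as np

# Hopfield-style attractor dynamics completing a partially visible pattern.
rng = np.random.default_rng(3)
D, n_patterns = 100, 5
patterns = rng.choice([-1.0, 1.0], size=(n_patterns, D))
W = patterns.T @ patterns / D
np.fill_diagonal(W, 0.0)

target = patterns[0].copy()
cue = target.copy()
cue[rng.random(D) < 0.85] = 0.0          # only ~15% of the pattern remains visible

state = cue.copy()
for _ in range(20):                       # recurrent updates fill in the rest
    state = np.sign(W @ state)
    state[state == 0] = 1.0

print("fraction of pattern recovered:", float(np.mean(state == target)))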

}, keywords = {Artificial Intelligence, computational neuroscience, Machine Learning, pattern completion, Visual object recognition}, issn = {0027-8424}, doi = {10.1073/pnas.1719397115}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1719397115}, author = {Hanlin Tang and Martin Schrimpf and William Lotter and Moerman, Charlotte and Paredes, Ana and Ortega Caro, Josue and Hardesty, Walter and David Cox and Gabriel Kreiman} } @article {3629, title = {Recurrent Multimodal Interaction for Referring Image Segmentation}, year = {2018}, month = {05/2018}, abstract = {

In this paper we are interested in the problem of image segmentation given natural language descriptions, i.e. referring expressions. Existing works tackle this problem by first modeling images and sentences independently and then segment images by combining these two types of representations. We argue that learning word-to-image interaction is more native in the sense of jointly modeling two modalities for the image segmentation task, and we propose convolutional multimodal LSTM to encode the sequential interactions between individual words, visual information, and spatial information. We show that our proposed model outperforms the baseline model on benchmark datasets. In addition, we analyze the intermediate output of the proposed multimodal LSTM approach and empirically explain how this approach enforces a more effective word-to-image interaction.

}, author = {Chenxi Liu and Zhe Lin and Xiaohui Shen and Jimei Yang and Xin Lu and Alan Yuille} } @article {4182, title = {Relational inductive bias for physical construction in humans and machines}, year = {2018}, month = {06/2018}, abstract = {

While current deep learning systems excel at tasks such as object classification, language processing, and gameplay, few can construct or modify a complex system such as a tower of blocks. We hypothesize that what these systems lack is a "relational inductive bias": a capacity for reasoning about inter-object relations and making choices over a structured description of a scene. To test this hypothesis, we focus on a task that involves gluing pairs of blocks together to stabilize a tower, and quantify how well humans perform. We then introduce a deep reinforcement learning agent which uses object- and relation-centric scene and policy representations and apply it to the task. Our results show that these structured representations allow the agent to outperform both humans and more naive approaches, suggesting that relational inductive bias is an important component in solving structured reasoning problems and for building more intelligent, flexible machines.

}, author = {Jessica B. Hamrick and Kelsey Allen and Victor Bapst and Tina Zhu and Kevin R. McKee and Joshua B. Tenenbaum and Battaglia, Peter} } @article {3630, title = {Scene Graph Parsing as Dependency Parsing}, year = {2018}, month = {05/2018}, abstract = {

In this paper, we study the problem of parsing structured knowledge graphs from textual descriptions. In particular, we consider the scene graph representation that considers objects together with their attributes and relations: this representation has been proved useful across a variety of vision and language applications. We begin by introducing an alternative but equivalent edge-centric view of scene graphs that connects to dependency parses. Together with a careful redesign of label and action space, we combine the two-stage pipeline used in prior work (generic dependency parsing followed by simple post-processing) into one, enabling end-to-end training. The scene graphs generated by our learned neural dependency parser achieve an F-score similarity of 49.67\% to ground truth graphs on our evaluation set, surpassing best previous approaches by 5\%. We further demonstrate the effectiveness of our learned parser on image retrieval applications.

}, author = {Yu-Siang Wang and Chenxi Liu and Xiaohui Zeng and Alan Yuille} } @article {4195, title = {Searching for visual features that explain response variance of face neurons in inferior temporal cortex}, journal = {PLOS ONE}, volume = {13}, year = {2018}, month = {09/2019}, pages = {e0201192}, abstract = {

Despite a large body of research on response properties of neurons in the inferior temporal (IT) cortex, studies to date have not yet produced quantitative feature descriptions that can predict responses to arbitrary objects. This deficit in the research prevents a thorough understanding of object representation in the IT cortex. Here we propose a fragment-based approach for finding quantitative feature descriptions of face neurons in the IT cortex. The development of the proposed method was driven by the assumption that it is possible to recover features from a set of natural image fragments if the set is sufficiently large. To find the feature from the set, we compared object responses predicted from each fragment with the responses of neurons to these objects, and searched for the fragment that revealed the highest correlation with neural object responses. Object responses were predicted from each fragment by normalizing the Euclidean distance between the fragment and each object to the range 0 to 1, such that smaller distances give higher values. The distance was calculated in a space where images were transformed to a local orientation space by a Gabor filter and a local max operation. The method allowed us to find features with a correlation coefficient between predicted and neural responses of 0.68 on average (number of object stimuli, 104) from among 560,000 feature candidates, reliably explaining differential responses among faces as well as a general preference for faces over non-face objects. Furthermore, predicted responses of the resulting features to novel object images were significantly correlated with neural responses to these images. Identification of features comprising specific, moderately complex combinations of local orientations and colors enabled us to predict responses to upright and inverted faces, which suggests a possible mechanism for face inversion effects.
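
A schematic of the scoring rule just described (not the paper's code): each candidate fragment's distance to every object is mapped to a 0-1 predicted response, and the fragment whose predictions correlate best with the measured responses is kept. Random vectors stand in here for the Gabor/local-max feature space and for the neural data.

import numpy as np

# Schematic fragment search: pick the fragment whose distance-based response
# predictions best correlate with a neuron's measured object responses.
rng = np.random.default_rng(4)
n_objects, n_fragments, dim = 104, 500, 64
objects = rng.normal(size=(n_objects, dim))       # stand-in object feature vectors
fragments = rng.normal(size=(n_fragments, dim))   # stand-in fragment candidates
neural = rng.normal(size=n_objects)               # stand-in neural responses

best_r, best_idx = -np.inf, -1
for i, frag in enumerate(fragments):
    dist = np.linalg.norm(objects - frag, axis=1)
    # Normalize distance to [0, 1] and invert: closer objects -> higher response.
    pred = 1.0 - (dist - dist.min()) / (dist.max() - dist.min())
    r = np.corrcoef(pred, neural)[0, 1]
    if r > best_r:
        best_r, best_idx = r, i

print("best fragment:", best_idx, "correlation:", round(float(best_r), 3))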

}, doi = {10.1371/journal.pone.0201192}, url = {http://dx.plos.org/10.1371/journal.pone.0201192}, author = {Owaki, Takashi and Vidal-Naquet, Michel and Nam, Yunjun and Uchida, Go and Sato, Takayuki and C{\^a}teau, Hideyuki and Shimon Ullman and Tanifuji, Manabu}, editor = {Nishijo, Hisao} } @conference {4320, title = {Shared gene co-expression networks in autism from induced pluripotent stem cell (iPSC) neurons}, booktitle = {BioRxiv}, year = {2018}, month = {6/19/2018}, abstract = {

Autism Spectrum Conditions (henceforth, autism) are a diverse set of neurodevelopmental phenotypes with a complex genetic basis. Idiopathic autism, characterized by a diagnosis of autism not caused by a known genetic variant, is associated with hundreds of rare and common genetic variants each of small effect. Functional genomics analyses of post mortem brain tissue have identified convergent atypical gene correlation networks in idiopathic autism. However, post mortem tissue is difficult to obtain and is susceptible to unknown confounding factors related to the cause of death and to storage conditions. To circumvent these limitations, we created induced pluripotent stem cells (iPSCs) from hair follicles of idiopathic autistic individuals and made iPSC-derived neurons, to investigate its usefulness as a substitute for post mortem brain tissue. Plucking hair follicles is a relatively painless and ethical procedure, and hair samples can be obtained from anyone. Functional genomics analyses were used as a replicable analysis pipeline to assess efficacy of iPSC-derived neurons. Gene expression networks, previously identified in adult autism brains, were atypical in the iPSC autism neural cultures in this study. These included those associated with neuronal maturation, synaptic maturation, immune response and inflammation, and gene regulatory mechanisms. In addition, GABRA4, HTR7, ROBO1 and SLITRK5 were atypically expressed among genes previously associated with autism. A drawback of this study was its small sample size, reflecting practical challenges in generating iPSCs from patient cohorts. We conclude that, using rigorous functional genomics analyses, atypical molecular processes seen in the adult autistic postmortem brain can be modelled in hair follicle iPSC-derived neurons. There is thus potential for scaling up of autism transcriptome studies using an iPSC-based model system.

}, doi = {10.1101/349415 }, author = {Adhya, D. and Swarup, V. and Nowosaid, P. and Shum, C. and K.M. Jozwik and McAlonan, G. and Mendez, M.A. and Horder, J. and Murphy, D. and Geschwind, D.H and Price, J. and Carroll, J. and Srivastava, D.P. and Baron-Cohen, S.} } @article {3881, title = {Single units in a deep neural network functionally correspond with neurons in the brain: preliminary results}, year = {2018}, month = {11/2018}, abstract = {

Deep neural networks have been shown to predict neural responses in higher visual cortex. The mapping from the model to a neuron in the brain occurs through a linear combination of many units in the model, leaving open the question of whether there also exists a correspondence at the level of individual neurons. Here we show that there exist many one-to-one mappings between single units in a deep neural network model and neurons in the brain. We show that this correspondence at the single- unit level is ubiquitous among state-of-the-art deep neural networks, and grows more pronounced for models with higher performance on a large-scale visual recognition task. Comparing matched populations{\textemdash}in the brain and in a model{\textemdash}we demonstrate a further correspondence at the level of the population code: stimulus category can be partially decoded from real neural responses using a classifier trained purely on a matched population of artificial units in a model. This provides a new point of investigation for phenomena which require fine-grained mappings between deep neural networks and the brain.
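
A toy sketch of the two steps described above, on synthetic data: match each "neuron" to its most correlated model unit, then decode stimulus category from the neurons with a classifier fit only on the matched artificial units. The data, the correlation-based matching, and the nearest-class-mean decoder are simplifying assumptions, not the authors' procedure.

import numpy as np

# Synthetic stand-ins: model-unit and neuron responses to the same stimuli.
rng = np.random.default_rng(5)
n_stim, n_neurons, n_units = 400, 20, 200
labels = rng.integers(0, 2, size=n_stim)           # two stimulus categories
signal = rng.normal(size=(n_stim, n_neurons)) + labels[:, None]
neurons = signal + 0.5 * rng.normal(size=signal.shape)
# Each neuron has one "matching" model unit plus many unrelated units.
units = np.concatenate(
    [signal + 0.5 * rng.normal(size=signal.shape),
     rng.normal(size=(n_stim, n_units - n_neurons))], axis=1)

# Matching: for each neuron, its most correlated model unit.
corr = np.corrcoef(neurons.T, units.T)[:n_neurons, n_neurons:]
matched = corr.argmax(axis=1)

# Train a category decoder (nearest class mean) on matched units, test on neurons.
mu0 = units[labels == 0][:, matched].mean(axis=0)
mu1 = units[labels == 1][:, matched].mean(axis=0)
pred = (np.linalg.norm(neurons - mu1, axis=1) <
        np.linalg.norm(neurons - mu0, axis=1)).astype(int)
print("cross-decoding accuracy on neurons:", round(float(np.mean(pred == labels)), 3))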

}, author = {Luke Arend and Yena Han and Martin Schrimpf and Pouya Bashivan and Kohitij Kar and Tomaso Poggio and James J. DiCarlo and Xavier Boix} } @conference {3549, title = {Single-Shot Object Detection with Enriched Semantics}, booktitle = {Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2018}, month = {06/2018}, address = {Salt Lake City, Utah}, abstract = {

We propose a novel single shot object detection network named Detection with Enriched Semantics (DES). Our motivation is to enrich the semantics of object detection features within a typical deep detector, by a semantic segmentation branch and a global activation module. The segmentation branch is supervised by weak segmentation ground-truth, i.e., no extra annotation is required. In conjunction with that, we employ a global activation module which learns relationship between channels and object classes in a self-supervised manner. Comprehensive experimental results on both PASCAL VOC and MS COCO detection datasets demonstrate the effectiveness of the proposed method. In particular, with a VGG16 based DES, we achieve an mAP of 81.7 on VOC2007 test and an mAP of 32.8 on COCO test-dev with an inference speed of 31.5 milliseconds per image on a Titan Xp GPU. With a lower resolution version, we achieve an mAP of 79.7 on VOC2007 with an inference speed of 13.0 milliseconds per image.

}, url = {http://cvpr2018.thecvf.com/}, author = {Zhishuai Zhang and Siyuan Qiao and Cihang Xie and Wei Shen and Bo Wang and Alan Yuille} } @article {3509, title = {Single-Shot Object Detection with Enriched Semantics}, year = {2018}, month = {06/2018}, abstract = {

We propose a novel single shot object detection network named Detection with Enriched Semantics (DES). Our motivation is to enrich the semantics of object detection features within a typical deep detector, by a semantic segmentation branch and a global activation module. The segmentation branch is supervised by weak segmentation ground-truth, i.e., no extra annotation is required. In conjunction with that, we employ a global activation module which learns relationship between channels and object classes in a self-supervised manner. Comprehensive experimental results on both PASCAL VOC and MS COCO detection datasets demonstrate the effectiveness of the proposed method. In particular, with a VGG16 based DES, we achieve an mAP of 81.7 on VOC2007 test and an mAP of 32.8 on COCO test-dev with an inference speed of 31.5 milliseconds per image on a Titan Xp GPU. With a lower resolution version, we achieve an mAP of 79.7 on VOC2007 with an inference speed of 13.0 milliseconds per image.

}, author = {Zhishuai Zhang and Siyuan Qiao and Cihang Xie and Wei Shen and Bo Wang and Alan Yuille} } @article {3904, title = {Spatiotemporal interpretation features in the recognition of dynamic images}, year = {2018}, month = {11/2018}, abstract = {

Objects and their parts can be visually recognized and localized from purely spatial information in static images and also from purely temporal information as in the perception of biological motion. Cortical regions have been identified which appear to specialize in visual recognition based on either static or dynamic cues, but the mechanisms by which spatial and temporal information is integrated are only poorly understood. Here we show that visual recognition of objects and actions can be achieved by efficiently combining spatial and motion cues in configurations where each source on its own is insufficient for recognition. This analysis is obtained by the identification of minimal spatiotemporal configurations: these are short videos in which objects and their parts, along with an action being performed, can be reliably recognized, but any reduction in either space or time makes them unrecognizable. State-of-the-art computational models for recognition from dynamic images based on deep 2D and 3D convolutional networks cannot replicate human recognition in these configurations. Action recognition in minimal spatiotemporal configurations is invariably accompanied by full human interpretation of the internal components of the image and their inter-relations. We hypothesize that this gap is due to mechanisms for a full spatiotemporal interpretation process, which in human vision is an integral part of recognizing dynamic events, but is not sufficiently represented in current DNNs.

}, author = {Guy Ben-Yosef and Gabriel Kreiman and Shimon Ullman} } @article {4192, title = {The statistical shape of geometric reasoning}, journal = {Scientific Reports}, volume = {8}, year = {2018}, month = {08/2018}, abstract = {

Geometric reasoning has an inherent dissonance: its abstract axioms and propositions refer to perfect, idealized entities, whereas its use in the physical world relies on dynamic perception of objects. How do abstract Euclidean concepts, dynamics, and statistics come together to support our intuitive geometric reasoning? Here, we address this question using a simple geometric task {\textendash} planar triangle completion. An analysis of the distribution of participants{\textquoteright} errors in localizing a fragmented triangle{\textquoteright}s missing corner reveals scale-dependent deviations from a deterministic Euclidean representation of planar triangles. By considering the statistical physics of the process characterized via a correlated random walk with a natural length scale, we explain these results and further predict participants{\textquoteright} estimates of the missing angle, measured in a second task. Our model also predicts the results of a categorical reasoning task about changes in the triangle size and shape even when such completion strategies need not be invoked. Taken together, our findings suggest a critical role for noisy physical processes in our reasoning about elementary Euclidean geometry.
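
A toy simulation of the correlated-random-walk idea invoked above: extrapolating a triangle side as a walk with persistent angular noise makes the scatter of the inferred missing corner grow with the extrapolation distance, one simple form of scale-dependent deviation. The step size, angular noise, and walk count are invented, not the paper's fitted parameters.

import numpy as np

# Toy correlated random walk: extrapolate a triangle side whose far corner is
# missing, and see how endpoint scatter grows with extrapolation distance.
rng = np.random.default_rng(6)

def endpoints(distance, step=0.05, kappa=0.02, n_walks=2000):
    """Endpoints of walks that try to head along +x for a given distance."""
    n_steps = int(distance / step)
    headings = np.cumsum(rng.normal(scale=kappa, size=(n_walks, n_steps)), axis=1)
    x = step * np.cos(headings).sum(axis=1)
    y = step * np.sin(headings).sum(axis=1)
    return np.stack([x, y], axis=1)

for d in (1.0, 2.0, 4.0):
    pts = endpoints(d)
    spread = float(np.std(pts[:, 1]))          # lateral scatter of the "corner"
    print(f"extrapolation length {d}: lateral spread {spread:.3f}")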

}, doi = {10.1038/s41598-018-30314-y}, url = {http://www.nature.com/articles/s41598-018-30314-y}, author = {Hart, Yuval and Moira R Dillon and Andrew Marantan and Cardenas, Anna L. and Elizabeth S Spelke and Mahadevan, L.} } @mastersthesis {4189, title = {Structured learning and inference with neural networks and generative models}, year = {2018}, author = {Owen Lewis} } @article {3573, title = {A task-optimized neural network replicates human auditory behavior, predicts brain responses, and reveals a cortical processing hierarchy}, journal = {Neuron}, volume = {98}, year = {2018}, month = {04/2018}, abstract = {

A core goal of auditory neuroscience is to build quantitative models that predict cortical responses to natural sounds. Reasoning that a complete model of auditory cortex must solve ecologically relevant tasks, we optimized hierarchical neural networks for speech and music recognition. The best-performing network contained separate music and speech pathways following early shared processing, potentially replicating human cortical organization. The network performed both tasks as well as humans and exhibited human-like errors despite not being optimized to do so, suggesting common constraints on network and human performance. The network predicted fMRI voxel responses substantially better than traditional spectrotemporal filter models throughout auditory cortex. It also provided a quantitative signature of cortical representational hierarchy{\textemdash}primary and non-primary responses were best predicted by intermediate and late network layers, respectively. The results suggest that task optimization provides a powerful set of tools for modeling sensory systems.

}, keywords = {auditory cortex, convolutional neural network, deep learning, deep neural network, encoding models, fMRI, Hierarchy, human auditory cortex, natural sounds, word recognition}, doi = {10.1016/j.neuron.2018.03.044}, url = {https://www.sciencedirect.com/science/article/pii/S0896627318302502}, author = {Alexander J. E. Kell and Daniel L K Yamins and Erica N Shook and Sam V Norman-Haignere and Josh H. McDermott} } @article {4185, title = {Theory I: Deep networks and the curse of dimensionality}, journal = {Bulletin of the Polish Academy of Sciences: Technical Sciences}, volume = {66}, year = {2018}, abstract = {

We review recent work characterizing the classes of functions for which deep learning can be exponentially better than shallow learning. Deep convolutional networks are a special case of these conditions, though weight sharing is not the main reason for their exponential advantage.
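
Stated informally (in the spirit of the shallow-versus-deep approximation bounds this line of work reviews, not quoted from the paper): approximating a generic $d$-variable function of smoothness $m$ to accuracy $\epsilon$ with a shallow network requires on the order of
\[
N_{\mathrm{shallow}} = O\!\left(\epsilon^{-d/m}\right)
\]
parameters, whereas a deep network matching a binary-tree compositional structure, whose constituent functions each depend on only two variables of smoothness $m$, requires only
\[
N_{\mathrm{deep}} = O\!\left((d-1)\,\epsilon^{-2/m}\right).
\]
For example, with $d = 8$, $m = 1$, and $\epsilon = 0.1$, the two bounds are roughly $10^{8}$ versus $7 \times 10^{2}$, which is the sense in which depth can avoid the curse of dimensionality for compositional functions.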

}, keywords = {convolutional neural networks, deep and shallow networks, deep learning, function approximation}, author = {Tomaso Poggio and Qianli Liao} } @article {4186, title = {Theory II: Deep learning and optimization}, journal = {Bulletin of the Polish Academy of Sciences: Technical Sciences}, volume = {66}, year = {2018}, abstract = {

The landscape of the empirical risk of overparametrized deep convolutional neural networks (DCNNs) is characterized with a mix of theory and experiments. In part A we show the existence of a large number of global minimizers with zero empirical error (modulo inconsistent equations). The argument which relies on the use of Bezout theorem is rigorous when the RELUs are replaced by a polynomial nonlinearity. We show with simulations that the corresponding polynomial network is indistinguishable from the RELU network. According to Bezout theorem, the global minimizers are degenerate unlike the local minima which in general should be non-degenerate. Further we experimentally analyzed and visualized the landscape of empirical risk of DCNNs on CIFAR-10 dataset. Based on above theoretical and experimental observations, we propose a simple model of the landscape of empirical risk. In part B, we characterize the optimization properties of stochastic gradient descent applied to deep networks. The main claim here consists of theoretical and experimental evidence for the following property of SGD: SGD concentrates in probability {\textendash} like the classical Langevin equation {\textendash} on large volume, {\textquotedblright}flat{\textquotedblright} minima, selecting with high probability degenerate minimizers which are typically global minimizers.

}, doi = {10.24425/bpas.2018.125925}, author = {Tomaso Poggio and Qianli Liao} } @article {3694, title = {Theory III: Dynamics and Generalization in Deep Networks}, year = {2018}, month = {06/2018}, abstract = {

The key to generalization is controlling the complexity of the network. However, there is no obvious control of complexity -- such as an explicit regularization term -- in the training of deep networks for classification. We will show that a classical form of norm control -- but kind of hidden -- is present in deep networks trained with gradient descent techniques on exponential-type losses. In particular, gradient descent induces a dynamics of the normalized weights which converge for $t \to \infty$ to an equilibrium which corresponds to a minimum norm (or maximum margin) solution. For sufficiently large but finite $\rho$ -- and thus finite $t$ -- the dynamics converges to one of several margin maximizers, with the margin monotonically increasing towards a limit stationary point of the flow. In the usual case of stochastic gradient descent, most of the stationary points are likely to be convex minima corresponding to a regularized, constrained minimizer -- the network with normalized weights -- which is stable and has asymptotically zero generalization gap for $N \to \infty$, where $N$ is the number of training examples. For finite, fixed $N$ the generalization gap may not be zero, but the minimum norm property of the solution can provide, we conjecture, good expected performance for suitable data distributions. Our approach extends some of the results of Srebro from linear networks to deep networks and provides a new perspective on the implicit bias of gradient descent. We believe that the elusive complexity control we describe is responsible for the puzzling empirical finding of good predictive performance by deep networks, despite overparametrization.

}, author = {Andrzej Banburski and Qianli Liao and Brando Miranda and Tomaso Poggio and Lorenzo Rosasco and Jack Hidary and Fernanda De La Torre} } @article {4193, title = {Third-Party Preferences for Imitators in Preverbal Infants}, journal = {Open Mind}, volume = {2}, year = {2018}, month = {12/2018}, pages = {61 - 71}, abstract = {

Participants in social interactions often imitate one another, thereby enhancing their affiliation. Here we probe the nature and early development of imitation-based affiliation through studies of infants{\textquoteright} preferences for animated characters who imitate, or are imitated by, other characters. Four experiments provide evidence that preverbal infants preferentially attend to and approach individuals who imitate others. This preferential engagement is elicited by the elements of mimicry in simple acts of helping. It does not, however, extend to the targets of imitation in these interactions. This set of findings suggests infants{\textquoteright} imitation-based preferences are not well explained by homophily, prestige, or familiarity. We propose instead that infants perceive imitation as an indicator of valuable attributes in a potential social partner, including the capacity and motivation for social attention and coordinated action.

}, keywords = {imitation, infancy, social cognition}, doi = {10.1162/opmi_a_00018}, url = {https://www.mitpressjournals.org/doi/abs/10.1162/opmi_a_00018}, author = {Lindsey J Powell and Elizabeth S Spelke} } @conference {3999, title = {Trading robust representations for sample complexity through self-supervised visual experience}, booktitle = {Advances in Neural Information Processing Systems 31}, year = {2018}, month = {12/2018}, pages = {9640{\textendash}9650}, publisher = {Curran Associates, Inc.}, organization = {Curran Associates, Inc.}, address = {Montreal, Canada}, abstract = {

Learning in small sample regimes is among the most remarkable features of the human perceptual system. This ability is related to robustness to transformations, which is acquired through visual experience in the form of weak- or self-supervision during development. We explore the idea of allowing artificial systems to learn representations of visual stimuli through weak supervision prior to downstream supervised tasks. We introduce a novel loss function for representation learning using unlabeled image sets and video sequences, and experimentally demonstrate that these representations support one-shot learning and reduce the sample complexity of multiple recognition tasks. We establish the existence of a trade-off between the sizes of weakly supervised data sets, automatically obtained from video sequences, and fully supervised data sets. Our results suggest that equivalence sets other than class labels, which are abundant in unlabeled visual experience, can be used for self-supervised learning of semantically relevant image embeddings.
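
A minimal numpy sketch of a loss over "equivalence sets" of the kind described above: embeddings of frames from the same short sequence are pulled together while different sequences are pushed at least a margin apart. The hinge-style formulation, the linear embedding, and all sizes are illustrative assumptions rather than the paper's loss.

import numpy as np

# Toy equivalence-set embedding loss: frames from the same sequence should map
# near each other, frames from different sequences should stay margin-apart.
rng = np.random.default_rng(7)
n_seq, frames_per_seq, dim, emb_dim = 10, 4, 32, 8
sources = rng.normal(size=(n_seq, dim))
frames = sources[:, None, :] + 0.1 * rng.normal(size=(n_seq, frames_per_seq, dim))
W = rng.normal(scale=0.1, size=(dim, emb_dim))     # a linear "embedding network"

def equivalence_loss(W, margin=1.0):
    z = frames @ W                                  # (sequence, frame, embedding)
    centers = z.mean(axis=1)
    pull = np.mean(np.sum((z - centers[:, None, :]) ** 2, axis=-1))
    d = np.linalg.norm(centers[:, None, :] - centers[None, :, :], axis=-1)
    off_diag = d[~np.eye(n_seq, dtype=bool)]
    push = np.mean(np.maximum(0.0, margin - off_diag) ** 2)
    return pull + push

print("loss at random initialization:", round(float(equivalence_loss(W)), 3))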

}, url = {http://papers.nips.cc/paper/8170-trading-robust-representations-for-sample-complexity-through-self-supervised-visual-experience.pdf}, author = {Tacchetti, Andrea and Stephen Voinea and Evangelopoulos, Georgios}, editor = {S. Bengio and H. Wallach and H. Larochelle and K. Grauman and N. Cesa-Bianchi and R. Garnett} } @article {3545, title = {Visual Concepts and Compositional Voting}, journal = {Annals of Mathematical Sciences and Applications (AMSA)}, volume = {3}, year = {2018}, pages = {151{\textendash}188}, abstract = {

It is very attractive to formulate vision in terms of pattern theory \cite{Mumford2010pattern}, where patterns are defined hierarchically by compositions of elementary building blocks. But applying pattern theory to real world images is currently less successful than discriminative methods such as deep networks. Deep networks, however, are black-boxes which are hard to interpret and can easily be fooled by adding occluding objects. It is natural to wonder whether by better understanding deep networks we can extract building blocks which can be used to develop pattern theoretic models. This motivates us to study the internal representations of a deep network using vehicle images from the PASCAL3D+ dataset. We use clustering algorithms to study the population activities of the features and extract a set of visual concepts which we show are visually tight and correspond to semantic parts of vehicles. To analyze this we annotate these vehicles by their semantic parts to create a new dataset, VehicleSemanticParts, and evaluate visual concepts as unsupervised part detectors. We show that visual concepts perform fairly well but are outperformed by supervised discriminative methods such as Support Vector Machines (SVM). We next give a more detailed analysis of visual concepts and how they relate to semantic parts. Following this, we use the visual concepts as building blocks for a simple pattern theoretical model, which we call compositional voting. In this model several visual concepts combine to detect semantic parts. We show that this approach is significantly better than discriminative methods like SVM and deep networks trained specifically for semantic part detection. Finally, we return to studying occlusion by creating an annotated dataset with occlusion, called VehicleOcclusion, and show that compositional voting outperforms even deep networks when the amount of occlusion becomes large.

}, keywords = {deep networks, pattern theory, visual concepts}, doi = {10.4310/AMSA.2018.v3.n1.a5}, url = {http://www.intlpress.com/site/pub/pages/journals/items/amsa/content/vols/0003/0001/a005/index.html}, author = {Jianyu Wang and Zhishuai Zhang and Cihang Xie and Yuyin Zhou and Vittal Premachandran and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3594, title = {Visual concepts and compositional voting}, year = {2018}, month = {03/2018}, abstract = {

It is very attractive to formulate vision in terms of pattern theory [26], where patterns are defined hierarchically by compositions of elementary building blocks. But applying pattern theory to real world images is very challenging and is currently less successful than discriminative methods such as deep networks. Deep networks, however, are black-boxes which are hard to interpret and, as we will show, can easily be fooled by adding occluding objects. It is natural to wonder whether by better understanding deep networks we can extract building blocks which can be used to develop pattern theoretic models. This motivates us to study the internal feature vectors of a deep network using images of vehicles from the PASCAL3D+ dataset with the scale of objects fixed. We use clustering algorithms, such as K-means, to study the population activity of the features and extract a set of visual concepts which we show are visually tight and correspond to semantic parts of the vehicles. To analyze this in more detail, we annotate these vehicles by their semantic parts to create a new dataset which we call VehicleSemanticParts, and evaluate visual concepts as unsupervised semantic part detectors. Our results show that visual concepts perform fairly well but are outperformed by supervised discriminative methods such as Support Vector Machines. We next give a more detailed analysis of visual concepts and how they relate to semantic parts. Following this analysis, we use the visual concepts as building blocks for a simple pattern theoretical model, which we call compositional voting. In this model several visual concepts combine to detect semantic parts. We show that this approach is significantly better than discriminative methods like Support Vector Machines and deep networks trained specifically for semantic part detection. Finally, we return to studying occlusion by creating an annotated dataset with occlusion, called Vehicle Occlusion, and show that compositional voting outperforms even deep networks when the amount of occlusion becomes large.
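
A schematic of the first step described here (not the paper's pipeline): cluster a pool of deep-feature vectors with K-means and treat the cluster centers as candidate "visual concepts", scored against new features by cosine similarity. Random vectors stand in for the network's internal features, and the similarity-based readout is a simplification of my own.

import numpy as np

# Schematic "visual concept" extraction: K-means on deep-feature vectors,
# then use cluster centers as simple part detectors via cosine similarity.
rng = np.random.default_rng(8)
features = rng.normal(size=(2000, 64))             # stand-in pool of CNN features
k = 20
centers = features[rng.choice(len(features), k, replace=False)]

for _ in range(10):                                 # a few Lloyd's iterations
    d = np.linalg.norm(features[:, None, :] - centers[None, :, :], axis=-1)
    assign = d.argmin(axis=1)
    for j in range(k):
        members = features[assign == j]
        if len(members):
            centers[j] = members.mean(axis=0)

def concept_response(patch_feature, centers=centers):
    """Cosine similarity of one feature vector to each visual concept."""
    a = patch_feature / np.linalg.norm(patch_feature)
    b = centers / np.linalg.norm(centers, axis=1, keepdims=True)
    return b @ a

print(concept_response(features[0]).round(2))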

}, author = {Jianyu Wang and Zhishuai Zhang and Cihang Xie and Yuyin Zhou and Vittal Premachandran and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3959, title = {What am I searching for?}, year = {2018}, month = {07/2018}, abstract = {

Can we infer intentions and goals from a person{\textquoteright}s actions? As an example of this family of problems, we consider here whether it is possible to decipher what a person is searching for by decoding their eye movement behavior. We conducted two human psychophysics experiments on object arrays and natural images where we monitored subjects{\textquoteright} eye movements while they were looking for a target object. Using as input the pattern of "error" fixations on non-target objects before the target was found, we developed a model (InferNet) whose goal was to infer what the target was. "Error" fixations share similar features with the sought target. The InferNet model uses a pre-trained 2D convolutional architecture to extract features from the error fixations and computes a 2D similarity map between the error fixation and all locations across the search image by modulating the search image via convolution across layers. InferNet consolidates the modulated response maps across layers via max pooling to keep track of the sub-patterns highly similar to features at error fixations and integrates these maps across all error fixations. InferNet successfully identifies the subject{\textquoteright}s goal and outperforms all the competitive null models, even without any object-specific training on the inference task.
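
A much-reduced sketch of the inference rule sketched above (not the InferNet implementation): correlate a feature patch taken at each "error" fixation against every location of a single feature map of the search image, and keep the best match across fixations. The feature map, fixation locations, patch size, and added noise are all invented for illustration.

import numpy as np

# Reduced sketch of goal inference from error fixations: slide each fixation's
# feature patch over the search image's feature map and accumulate similarity.
rng = np.random.default_rng(9)
H, W, C, P = 32, 32, 8, 5                          # feature map and patch sizes
feat = rng.normal(size=(H, W, C))                  # stand-in image feature map
fixations = [(10, 12), (22, 25)]                   # two hypothetical "error" fixations
patches = [feat[r:r + P, c:c + P] + 0.1 * rng.normal(size=(P, P, C))
           for r, c in fixations]

sim = np.zeros((H - P + 1, W - P + 1))
for patch in patches:
    v = patch.ravel() / np.linalg.norm(patch)
    for r in range(H - P + 1):
        for c in range(W - P + 1):
            window = feat[r:r + P, c:c + P].ravel()
            score = v @ (window / np.linalg.norm(window))
            sim[r, c] = max(sim[r, c], score)      # keep the best match so far

print("most target-like location:", np.unravel_index(sim.argmax(), sim.shape))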

}, author = {Zhang, Mengmi and Feng, Jiashi and Lim, Joo Hwee and Qi Zhao and Gabriel Kreiman} } @article {3517, title = {What is changing when: decoding visual information in movies from human intracranial recordings}, journal = {NeuroImage}, volume = {180, Part A}, year = {2018}, month = {10/2018}, pages = {147-159}, abstract = {

The majority of visual recognition studies have focused on the neural responses to repeated presentations of static stimuli with abrupt and well-defined onset and offset times. In contrast, natural vision involves unique renderings of visual inputs that are continuously changing without explicitly defined temporal transitions. Here we considered commercial movies as a coarse proxy to natural vision. We recorded intracranial field potential signals from 1,284 electrodes implanted in 15 patients with epilepsy while the subjects passively viewed commercial movies. We could rapidly detect large changes in the visual inputs within approximately 100 ms of their occurrence, using exclusively field potential signals from ventral visual cortical areas including the inferior temporal gyrus and inferior occipital gyrus. Furthermore, we could decode the content of those visual changes even in a single movie presentation, generalizing across the wide range of transformations present in a movie. These results present a methodological framework for studying cognition during dynamic and natural vision.

}, keywords = {Electrocorticography (ECoG), Movies, Natural vision, neural decoding, object recognition, Ventral pathway}, doi = {10.1016/j.neuroimage.2017.08.027}, url = {https://www.sciencedirect.com/science/article/pii/S1053811917306742}, author = {Leyla Isik and Jedediah Singer and Nancy Kanwisher and Madsen JR and Anderson WS and Gabriel Kreiman} } @conference {2491, title = {Active Video Summarization: Customized Summaries via On-line Interaction.}, booktitle = {AAAI Conference on Artificial Intelligence}, year = {2017}, author = {Garcia del Molino, A and X Boix and Lim, J. and Tan, A} } @article {2668, title = {Adaptive Compression of Statistically Homogenous Sensory Signals}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @conference {2586, title = {Attention Correctness in Neural Image Captioning}, booktitle = {AAAI 2017}, year = {2017}, abstract = {

Attention mechanisms have recently been introduced in deep learning for various tasks in natural language processing and computer vision. But despite their popularity, the "correctness" of the implicitly-learned attention maps has only been assessed qualitatively by visualization of several examples. In this paper we focus on evaluating and improving the correctness of attention in neural image captioning models. Specifically, we propose a quantitative evaluation metric for the consistency between the generated attention maps and human annotations, using recently released datasets with alignment between regions in images and entities in captions. We then propose novel models with different levels of explicit supervision for learning attention maps during training. The supervision can be strong when alignment between regions and caption entities are available, or weak when only object segments and categories are provided. We show on the popular Flickr30k and COCO datasets that introducing supervision of attention maps during training solidly improves both attention correctness and caption quality, showing the promise of making machine perception more human-like.
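
As a small illustration of the kind of metric described above (a simplified reading, not necessarily the paper's exact definition): score attention correctness as the fraction of the normalized attention map's mass that falls inside the human-annotated region for the generated entity. The map and region below are synthetic.

import numpy as np

# Attention "correctness" as the attention mass inside the annotated region.
rng = np.random.default_rng(10)
attention = rng.random((14, 14))
attention /= attention.sum()                       # normalized attention map

region = np.zeros((14, 14), dtype=bool)
region[3:8, 4:9] = True                            # human-annotated entity region

correctness = float(attention[region].sum())
print("attention correctness:", round(correctness, 3))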

}, author = {Chenxi Liu and Junhua Mao and Fei Sha and Alan Yuille} } @article {2749, title = {Auditory Perception of Material and Force from Impact Sounds}, year = {2017}, author = {James Traer and Josh H. McDermott} } @article {3392, title = {A Balanced Comparison of Object Invariances in Monkey IT Neurons}, journal = {eneuro}, volume = {4}, year = {2017}, month = {Jan-04-2018}, pages = {ENEURO.0333-16.2017}, doi = {10.1523/ENEURO.0333-16.2017}, url = {http://eneuro.sfn.org/lookup/doi/10.1523/ENEURO.0333-16.2017https://syndication.highwire.org/content/doi/10.1523/ENEURO.0333-16.2017}, author = {N. Apurva Ratan Murty and Arun, Sripati P.} } @article {3441, title = {Building machines that learn and think like people.}, journal = {Behavioral and Brain Sciences}, volume = {40}, year = {2017}, month = {2017 Jan}, pages = {e253}, abstract = {

Recent progress in artificial intelligence has renewed interest in building systems that learn and think like people. Many advances have come from using deep neural networks trained end-to-end in tasks such as object recognition, video games, and board games, achieving performance that equals or even beats that of humans in some respects. Despite their biological inspiration and performance achievements, these systems differ from human intelligence in crucial ways. We review progress in cognitive science suggesting that truly human-like learning and thinking machines will have to reach beyond current engineering trends in both what they learn and how they learn it. Specifically, we argue that these machines should (1) build causal models of the world that support explanation and understanding, rather than merely solving pattern recognition problems; (2) ground learning in intuitive theories of physics and psychology to support and enrich the knowledge that is learned; and (3) harness compositionality and learning-to-learn to rapidly acquire and generalize knowledge to new tasks and situations. We suggest concrete challenges and promising routes toward these goals that can combine the strengths of recent neural network advances with more structured cognitive models.

}, issn = {1469-1825}, doi = {https://doi.org/10.1017/S0140525X16001837}, url = {https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/article/building-machines-that-learn-and-think-like-people/A9535B1D745A0377E16C590E14B94993/core-reader}, author = {Brenden M Lake and Ullman, Tomer D and Joshua B. Tenenbaum and Samuel J Gershman} } @conference {2822, title = {Causal and compositional generative models in online perception}, booktitle = {39th Annual Conference of the Cognitive Science Society}, year = {2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard pattern recognition approaches to perception, we argue that this mapping draws on internal causal and compositional models of the outside physical world, and that such internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic properties of objects such as their shapes and surfaces in addition to their extrinsic properties such as pose and occlusion. These latent variables can be composed in novel ways and are inputs to sensory-specific causal models that output sense-specific signals. We present a novel recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not an alternative baseline model or a lesion of our model, can account for human performance in an occluded face matching task and in a cross-modal visual-to-haptic face matching task.

}, author = {Ilker Yildirim and Michael Janner}, editor = {Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {3539, title = {Causal and compositional generative models in online perception}, year = {2017}, month = {07/2017}, address = {London, UK}, abstract = {

From a quick glance or the touch of an object, our brains map sensory signals to scenes composed of rich and detailed shapes and surfaces. Unlike the standard approaches to perception, we argue that this mapping draws on internal causal and compositional models of the physical world and these internal models underlie the generalization capacity of human perception. Here, we present a generative model of visual and multisensory perception in which the latent variables encode intrinsic (e.g., shape) and extrinsic (e.g., occlusion) object properties. Latent variables are inputs to causal models that output sense-specific signals. We present a recognition network that performs efficient inference in the generative model, computing at a speed similar to online perception. We show that our model, but not alternatives, can account for human performance in an occluded face matching task and in a visual-to-haptic face matching task.

}, url = {https://mindmodeling.org/cogsci2017/papers/0266/index.html}, author = {Ilker Yildirim and Michael Janner and Mario Belledonne and Christian Wallraven and W. A. Freiwald and Joshua B. Tenenbaum} } @proceedings {3445, title = {Causal learning from interventions and dynamics in continuous time}, year = {2017}, abstract = {

Event timing and interventions are important and intertwined cues to causal structure, yet they have typically been studied separately. We bring them together for the first time in an experiment where participants learn causal structure by performing interventions in continuous time. We contrast learning in acyclic and cyclic devices, with reliable and unreliable cause{\textendash}effect delays. We show that successful learners use interventions to structure and simplify their interactions with the devices and that we can capture judgment patterns with heuristics based on online construction and testing of a single structural hypothesis.

}, author = {Neil Bramley and Ralf Mayrhofer and Tobias Gerstenberg and D. A. Lagnado} } @article {2639, title = {A Causal Relationship Between Face-Patch Activity and Face-Detection Behavior}, journal = {eLife}, year = {2017}, month = {04/2017}, abstract = {

The primate brain contains distinct areas densely populated by face-selective neurons. One of these, face-patch ML, contains neurons selective for contrast relationships between face parts. Such contrast-relationships can serve as powerful heuristics for face detection. However, it is unknown whether neurons with such selectivity actually support face-detection behavior. Here, we devised a naturalistic face-detection task and combined it with fMRI-guided pharmacological inactivation of ML to test whether ML is of critical importance for real-world face detection. We found that inactivation of ML impairs face detection. The effect was anatomically specific, as inactivation of areas outside ML did not affect face detection, and it was categorically specific, as inactivation of ML impaired face detection while sparing body and object detection. These results establish that ML function is crucial for detection of faces in natural scenes, performing a critical first step on which other face processing operations can build.

}, keywords = {face patch, fMRI, inactivation, Neuroscience}, doi = {https://doi.org/10.7554/eLife.18558.001}, url = {https://elifesciences.org/articles/18558}, author = {Srivatsun Sadagopan and Wilbert Zarco and W. A. Freiwald} } @article {2736, title = {Changing minds: Children{\textquoteright}s inferences about third party belief revision}, journal = {Developmental Science}, year = {2017}, month = {05/2017}, pages = {e12553}, abstract = {

By the age of five, children explicitly represent that agents can have both true and false beliefs based on epistemic access to information (e.g., Wellman, Cross, \& Watson, 2001). Children also begin to understand that agents can view identical evidence and draw different inferences from it (e.g., Carpenter \& Chandler, 1996). However, much less is known about when, and under what conditions, children expect other agents to change their minds. Here, inspired by formal ideal observer models of learning, we investigate children{\textquoteright}s expectations of the dynamics that underlie third parties{\textquoteright} belief revision. We introduce an agent who has prior beliefs about the location of a population of toys and then observes evidence that, from an ideal observer perspective, either does, or does not justify revising those beliefs. We show that children{\textquoteright}s inferences on behalf of third parties are consistent with the ideal observer perspective, but not with a number of alternative possibilities, including that children expect other agents to be influenced only by their prior beliefs, only by the sampling process, or only by the observed data. Rather, children integrate all three factors in determining how and when agents will update their beliefs from evidence.

}, keywords = {learning, rational action, theory of mind}, doi = {10.1111/desc.12553}, author = {Rachel Magid and Phyllis Yan and Max Siegel and Joshua B. Tenenbaum and Laura Schulz} } @conference {3636, title = {Character-building stories}, booktitle = {Advances in Cognitive Systems}, year = {2017}, month = {05/2017}, address = {Troy, NY}, abstract = {

We argue that story understanding mechanisms provide a foundation for modeling aspects of our ability to reason hypothetically. We first note that story understanding mechanisms enable us to answer what-if questions about what would happen if an event did or did not occur, and we note that story understanding enables us to answer what-if questions about how a story would be interpreted from a different cultural perspective. We then advance a theory of how humans use hypothetical reasoning to think about personality traits. Our theory and implementation describe how humans use past behavior and untapped alternatives to build a model of characters{\textquoteright} motives and constraints.
We focus on how generalizations of existing story understanding methods and concepts enable us to model this competence efficiently. In a sample story, our theory and implementation perform a complex reasoning process to decide what a character will do next based on whether the character is more like a Conformist, Thief, Opportunist, or Robin Hood archetype.

}, author = {Patrick Henry Winston and Dylan Holmes} } @article {3621, title = {Children understand that agents maximize expected utilities.}, journal = {Journal of Experimental Psychology: General}, volume = {146}, year = {2017}, month = {Jan-11-2017}, pages = {1574 - 1585}, abstract = {

A growing set of studies suggests that our ability to infer, and reason about, mental states is supported by the assumption that agents maximize utilities{\textemdash}the rewards they attain minus the costs they incur. This assumption enables observers to work backward from agents{\textquoteright} observed behavior to their underlying beliefs, preferences, and competencies. Intuitively, however, agents may have incomplete, uncertain, or wrong beliefs about what they want. More formally, agents try to maximize their expected utilities. This understanding is crucial when reasoning about others{\textquoteright} behavior: It dictates when actions reveal preferences, and it makes predictions about the stability of behavior over time. In a set of 7 experiments we show that 4- and 5-year-olds understand that agents try to maximize expected utilities, and that these responses cannot be explained by simpler accounts. In particular, these results suggest a modification to the standard belief/desire model of intuitive psychology. Children do not treat beliefs and desires as independent; rather, they recognize that agents have beliefs about their own desires and that this has consequences for the interpretation of agents{\textquoteright} actions.
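As a compact way of stating the principle tested in these experiments, the following equation sketches the expected-utility rule in generic notation (symbols are ours, not the paper's): the agent picks the action that maximizes reward expected under its own, possibly mistaken, beliefs, minus the cost of acting.

\[
  a^{*} \;=\; \arg\max_{a}\;\; \mathbb{E}_{s \sim P(s \mid b)}\bigl[R(s, a)\bigr] \;-\; C(a),
\]

where $b$ denotes the agent's belief state (which may be incomplete, uncertain, or wrong), $R(s,a)$ the reward of outcome $s$ under action $a$, and $C(a)$ the cost incurred by the action.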

}, issn = {0096-3445}, doi = {10.1037/xge0000345}, url = {http://doi.apa.org/getdoi.cfm?doi=10.1037/xge0000345http://psycnet.apa.org/journals/xge/146/11/1574.pdf}, author = {Julian Jara-Ettinger and Floyd, Samantha and Joshua B. Tenenbaum and Laura Schulz} } @article {3537, title = {Comparing human and monkey neural circuits for processing social scenes}, year = {2017}, author = {Julia Sliwa and S. R. Marvel and W. A. Freiwald} } @article {3440, title = {Compositional inductive biases in function learning.}, journal = {Cogn Psychol}, volume = {99}, year = {2017}, month = {2017 Dec}, pages = {44-79}, abstract = {

How do people recognize and learn about complex functional structure? Taking inspiration from other areas of cognitive science, we propose that this is achieved by harnessing compositionality: complex structure is decomposed into simpler building blocks. We formalize this idea within the framework of Bayesian regression using a grammar over Gaussian process kernels, and compare this approach with other structure learning approaches. Participants consistently chose compositional (over non-compositional) extrapolations and interpolations of functions. Experiments designed to elicit priors over functional patterns revealed an inductive bias for compositional structure. Compositional functions were perceived as subjectively more predictable than non-compositional functions, and exhibited other signatures of predictability, such as enhanced memorability and reduced numerosity. Taken together, these results support the view that the human intuitive theory of functions is inherently compositional.
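To make the kernel-composition idea concrete, here is a brief sketch using scikit-learn's Gaussian process kernels (an illustration with our own toy data and hyperparameters, not the grammar or models used in the paper): a compositional kernel built from simple parts (a linear trend plus a periodic component) is compared with a single non-compositional RBF kernel on data that actually have compositional structure.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, DotProduct

X = np.linspace(0, 10, 50)[:, None]
y = 0.5 * X.ravel() + np.sin(3 * X.ravel())   # linear trend plus a periodic component

kernels = {
    "compositional (linear + periodic)": DotProduct() + ExpSineSquared(length_scale=1.0, periodicity=2.1),
    "non-compositional (RBF)": RBF(length_scale=1.0),
}
for name, kernel in kernels.items():
    gp = GaussianProcessRegressor(kernel=kernel, alpha=1e-3).fit(X, y)
    print(name, gp.log_marginal_likelihood_value_)   # higher means a better account of the data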

}, issn = {1095-5623}, doi = {10.1016/j.cogpsych.2017.11.002}, url = {https://www.sciencedirect.com/science/article/pii/S0010028517301743?via\%3Dihub}, author = {Eric Schulz and Joshua B. Tenenbaum and David Duvenaud and Maarten Speekenbrink and Samuel J Gershman} } @article {3282, title = {Compression of Deep Neural Networks for Image Instance Retrieval}, year = {2017}, month = {01/2017}, abstract = {

Image instance retrieval is the problem of retrieving images from a database which contain the same object. Convolutional Neural Network (CNN) based descriptors are becoming the dominant approach for generating {\it global image descriptors} for the instance retrieval problem. One major drawback of CNN-based {\it global descriptors} is that uncompressed deep neural network models require hundreds of megabytes of storage making them inconvenient to deploy in mobile applications or in custom hardware. In this work, we study the problem of neural network model compression focusing on the image instance retrieval task. We study quantization, coding, pruning and weight sharing techniques for reducing model size for the instance retrieval problem. We provide extensive experimental results on the trade-off between retrieval performance and model size for different types of networks on several data sets providing the most comprehensive study on this topic. We compress models to the order of a few MBs: two orders of magnitude smaller than the uncompressed models while achieving negligible loss in retrieval performance.
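The sketch below illustrates, with our own toy weight matrix, two of the model-size reduction steps the abstract names (magnitude pruning and uniform scalar quantization); it is a schematic of the general techniques, not the paper's compression pipeline.

import numpy as np

def prune(weights, keep_fraction=0.1):
    # Zero out all but the largest-magnitude fraction of weights.
    k = max(1, int(weights.size * keep_fraction))
    threshold = np.sort(np.abs(weights), axis=None)[-k]
    return np.where(np.abs(weights) >= threshold, weights, 0.0)

def quantize(weights, n_bits=8):
    # Uniformly quantize weights to 2**n_bits levels over their observed range.
    lo, hi = float(weights.min()), float(weights.max())
    levels = 2 ** n_bits - 1
    codes = np.round((weights - lo) / (hi - lo) * levels).astype(np.uint8)
    return codes, lo, hi   # codes plus the range needed to dequantize later

W = np.random.randn(256, 512).astype(np.float32)
codes, lo, hi = quantize(prune(W, keep_fraction=0.1))
print(W.nbytes, "bytes ->", codes.nbytes, "bytes (before any entropy coding)")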

}, url = {https://arxiv.org/abs/1701.04923}, author = {Vijay Chandrasekhar and Jie Lin and Qianli Liao and Olivier Mor{\`e}re and Antoine Veillard and Lingyu Duan and Tomaso Poggio} } @article {3442, title = {Cost-Benefit Arbitration Between Multiple Reinforcement-Learning Systems.}, journal = {Psychol Sci}, volume = {28}, year = {2017}, month = {2017 Sep}, pages = {1321-1333}, abstract = {

Human behavior is sometimes determined by habit and other times by goal-directed planning. Modern reinforcement-learning theories formalize this distinction as a competition between a computationally cheap but inaccurate model-free system that gives rise to habits and a computationally expensive but accurate model-based system that implements planning. It is unclear, however, how people choose to allocate control between these systems. Here, we propose that arbitration occurs by comparing each system{\textquoteright}s task-specific costs and benefits. To investigate this proposal, we conducted two experiments showing that people increase model-based control when it achieves greater accuracy than model-free control, and especially when the rewards of accurate performance are amplified. In contrast, they are insensitive to reward amplification when model-based and model-free control yield equivalent accuracy. This suggests that humans adaptively balance habitual and planned action through on-line cost-benefit analysis.

}, issn = {1467-9280}, doi = {10.1177/0956797617708288}, author = {Kool, Wouter and Samuel J Gershman and Fiery A Cushman} } @article {2534, title = {The cradle of social knowledge: Infants{\textquoteright} reasoning about caregiving and affiliation}, journal = {Cognition}, volume = {159}, year = {2017}, month = {02/2017}, pages = {102-116}, abstract = {

Considerable research has examined infants{\textquoteright} understanding and evaluations of social agents, but two questions remain unanswered: First, do infants organize observed social relations into larger structures, inferring the relationship between two social beings based on their relations to a third party? Second, how do infants reason about a type of social relation prominent in all societies: the caregiving relation between parents and their babies? In a series of experiments using animated events, we ask whether 15- to 18-month-old infants infer that two babies who were comforted by the same adult, or two adults who comforted the same baby, will affiliate with one another. We find that infants make both of these inferences, but they make no comparable inferences when presented with the same visible events with voices that specify a peer context, in which one adult responds to another laughing adult. Thus, infants are sensitive to at least one aspect of caregiving and organize relations between infants and adults into larger social structures.

}, keywords = {caregiving, social cognition, social development}, doi = {10.1016/j.cognition.2016.11.008}, author = {A C Spokes and Elizabeth S Spelke} } @proceedings {2605, title = {Critical Cues in Early Physical Reasoning}, year = {2017}, address = {Austin, TX}, author = {Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @conference {2543, title = {A Data Science approach to analyzing neural data}, booktitle = {Joint Statistical Meetings}, year = {2017}, abstract = {

Data Science is a field that uses computational tools to extract insight from large noisy data sets. While Data Science borrows heavily from Statistics (and one could reasonably argue that they are the same field), the culture, approach, and tools used by Data Scientists often differ from those that are more commonly used by Statisticians (Breiman 2001, Donoho, 2015). Additionally, while Data Science approaches are most widely used in industry, scientists in academic fields usually use classical statistical approaches. In this paper we illustrate how a Data Science approach can give useful insights into scientific questions by describing our work using machine learning methods to analyze neural data. We also outline additional ways in which Neuroscience and other fields could benefit from incorporating more Data Science perspectives into how problems are approached, and areas where Data Science approaches could benefit from more rigorous Statistical methods.

}, author = {Ethan Meyers} } @article {2745, title = {Deciphering neural codes of memory during sleep}, journal = {Trends in Neurosciences}, year = {2017}, author = {Zhe Chen and Matthew A. Wilson} } @article {2863, title = {A Dedicated Network for Social Interaction Processing in the Primate Brain}, journal = {Science}, volume = {Vol. 356}, year = {2017}, month = {05/2017}, pages = {pp. 745-749}, abstract = {

Primate cognition requires interaction processing. Interactions can reveal otherwise hidden properties of intentional agents, such as thoughts and feelings, and of inanimate objects, such as mass and material. Where and how interaction analyses are implemented in the brain is unknown. Using whole-brain fMRI in macaque monkeys, we discovered a network centered in medial and ventrolateral prefrontal cortex, engaged exclusively in social interaction analysis. Exclusivity of specialization was found for no other function anywhere in the brain. Two additional networks, a parieto-premotor and a temporal one, exhibited both social and physical interaction preference, which, in the temporal lobe, mapped onto a fine-grain pattern of object, body, and face selectivity. The extent and location of a dedicated system for social interaction analysis suggest this function as an evolutionary forerunner of human mind-reading capabilities.

}, doi = {DOI: 10.1126/science.aam6383 }, url = {http://science.sciencemag.org/content/356/6339/745}, author = {J. Sliwa and W. A. Freiwald} } @conference {2676, title = {Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning}, booktitle = {ICLR}, year = {2017}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {2567, title = {Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning}, year = {2017}, month = {03/2017}, abstract = {

While great strides have been made in using deep learning algorithms to solve supervised learning tasks, the problem of unsupervised learning{\textemdash}leveraging unlabeled examples to learn about the structure of a domain {\textemdash} remains a difficult unsolved challenge. Here, we explore prediction of future frames in a video sequence as an unsupervised learning rule for learning about the structure of the visual world. We describe a predictive neural network ({\textquotedblleft}PredNet{\textquotedblright}) architecture that is inspired by the concept of {\textquotedblleft}predictive coding{\textquotedblright} from the neuroscience literature. These networks learn to predict future frames in a video sequence, with each layer in the network making local predictions and only forwarding deviations from those predictions to subsequent network layers. We show that these networks are able to robustly learn to predict the movement of synthetic (rendered) objects, and that in doing so, the networks learn internal representations that are useful for decoding latent object parameters (e.g. pose) that support object recognition with fewer training views. We also show that these networks can scale to complex natural image streams (car-mounted camera videos), capturing key aspects of both egocentric movement and the movement of objects in the visual scene, and the representation learned in this setting is useful for estimating the steering angle. Altogether, these results suggest that prediction represents a powerful framework for unsupervised learning, allowing for implicit learning of object and scene structure.
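As a toy illustration of the predictive-coding principle described above (a didactic sketch, not the PredNet architecture, which uses convolutional LSTM layers), the snippet below has a single layer learn to predict the next "frame" from the current one with a delta rule; the prediction error is the quantity such a layer would forward to the next level.

import numpy as np

rng = np.random.default_rng(0)
frame_t, frame_t1 = rng.random(64), rng.random(64)   # two consecutive toy "frames"

W = np.zeros((64, 64))                               # the layer's predictive weights
for _ in range(200):
    prediction = W @ frame_t
    error = frame_t1 - prediction                    # the signal a predictive-coding layer forwards
    W += 0.01 * np.outer(error, frame_t)             # delta-rule update of the prediction

print("mean |prediction error| after learning:", np.abs(frame_t1 - W @ frame_t).mean())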

}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {2806, title = {Design of the Artificial: lessons from the biological roots of general intelligence}, year = {2017}, abstract = {
Our desire and fascination with intelligent machines dates back to antiquity{\textquoteright}s mythical automaton Talos, Aristotle{\textquoteright}s mode of mechanical thought (syllogism) and Heron of Alexandria{\textquoteright}s mechanical machines and automata. However, the quest for Artificial General Intelligence (AGI) has been troubled by repeated failures of strategies and approaches throughout its history. This decade has seen a shift in interest towards bio-inspired software and hardware, with the assumption that such mimicry entails intelligence. Though these steps are fruitful in certain directions and have advanced automation, their singular design focus renders them highly inefficient in achieving AGI. Which set of requirements has to be met in the design of AGI? What are the limits in the design of the artificial? Here, a careful examination of computation in biological systems hints that evolutionary tinkering of contextual processing of information, enabled by a hierarchical architecture, is the key to building AGI.
}, url = {https://arxiv.org/pdf/1703.02245}, author = {Nima Dehghani} } @conference {3547, title = {Detecting Semantic Parts on Partially Occluded Objects}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2017}, month = {09/2017}, address = {London, UK}, abstract = {

In this paper, we address the task of detecting semantic parts on partially occluded objects. We consider a scenario where the model is trained using non-occluded images but tested on occluded images. The motivation is that there is an infinite number of occlusion patterns in the real world, which cannot be fully covered in the training data. So the models should be inherently robust and adaptive to occlusions instead of fitting / learning the occlusion patterns in the training data. Our approach detects semantic parts by accumulating the confidence of local visual cues. Specifically, the method uses a simple voting method, based on log-likelihood ratio tests and spatial constraints, to combine the evidence of local cues. These cues are called visual concepts, which are derived by clustering the internal states of deep networks. We evaluate our voting scheme on the VehicleSemanticPart dataset with dense part annotations. We randomly place two, three or four irrelevant objects onto the target object to generate testing images with various occlusions. Experiments show that our algorithm outperforms several competitors in semantic part detection when occlusions are present.
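A schematic sketch of the voting scheme described above (our own simplified rendering, with made-up probabilities and offsets, not the paper's implementation): each detected visual concept casts a vote for the part's location, weighted by the log-likelihood ratio of that cue appearing on the part versus on background, and the votes accumulate in a score map.

import numpy as np

def vote_for_part(cue_detections, p_cue_given_part, p_cue_given_bg, offsets, shape):
    # cue_detections: list of (cue_id, row, col); offsets: expected (drow, dcol) to the part center.
    score = np.zeros(shape)
    for cue_id, r, c in cue_detections:
        llr = np.log(p_cue_given_part[cue_id] / p_cue_given_bg[cue_id])   # evidence weight
        pr, pc = r + offsets[cue_id][0], c + offsets[cue_id][1]           # spatial constraint
        if 0 <= pr < shape[0] and 0 <= pc < shape[1]:
            score[pr, pc] += llr
    return score

# Toy example: two cues voting for the same location in a 32x32 map.
detections = [(0, 10, 12), (1, 14, 9)]
heat = vote_for_part(detections, {0: 0.6, 1: 0.5}, {0: 0.1, 1: 0.2},
                     {0: (2, 0), 1: (-2, 3)}, shape=(32, 32))
print(np.unravel_index(heat.argmax(), heat.shape), heat.max())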

}, url = {https://bmvc2017.london/proceedings/}, author = {Jianyu Wang and Cihang Xie and Zhishuai Zhang and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3503, title = {Detecting Semantic Parts on Partially Occluded Objects}, year = {2017}, month = {09/2017}, abstract = {

In this paper, we address the task of detecting semantic parts on partially occluded objects. We consider a scenario where the model is trained using non-occluded images but tested on occluded images. The motivation is that there is an infinite number of occlusion patterns in the real world, which cannot be fully covered in the training data. So the models should be inherently robust and adaptive to occlusions instead of fitting / learning the occlusion patterns in the training data. Our approach detects semantic parts by accumulating the confidence of local visual cues. Specifically, the method uses a simple voting method, based on log-likelihood ratio tests and spatial constraints, to combine the evidence of local cues. These cues are called visual concepts, which are derived by clustering the internal states of deep networks. We evaluate our voting scheme on the VehicleSemanticPart dataset with dense part annotations. We randomly place two, three or four irrelevant objects onto the target object to generate testing images with various occlusions. Experiments show that our algorithm outperforms several competitors in semantic part detection when occlusions are present.

}, author = {Jianyu Wang and Cihang Xie and Zhishuai Zhang and Jun Zhu and Lingxi Xie and Alan Yuille} } @article {3466, title = {Differences in dynamic and static coding within different subdivision of the prefrontal cortex}, year = {2017}, month = {11/2017}, address = {Washington, DC}, abstract = {

A longstanding question in neuroscience concerns the neural basis of working memory. Early work showed that neurons in the prefrontal cortex (PFC) hold information in working memory by having sustained firing rates for extended periods of time, while more recent work has shown that many neurons in the PFC appear to be selective for shorter periods of time and thus information in working memory is contained in a dynamic population code (Meyers et al. 2008, 2012, Stokes et al. 2013). As more results have accumulated, it has become increasingly clear that different studies are leading to different results, with some studies showing predominantly static codes, while others show primarily dynamic codes (King and Dehaene, 2014); however, it remains unclear what is leading to these different findings. One possibility is that different brain regions code information differently, and that different studies have recorded neural activity from different regions. To examine this possibility, we recorded neural activity from five different subdivisions of the PFC (posterior-dorsal, mid-dorsal, anterior-dorsal, posterior-ventral, anterior-ventral PFC) and compared the neural coding properties in these subdivisions. A total of 1856 neurons in four monkeys trained to perform spatial and shape working memory tasks were analyzed. Our results show striking differences in how these subdivisions code information, with some subdivisions containing a completely dynamic code, and other subdivisions containing a completely static code. These findings give a potential explanation for discrepancies in the literature and should lead to a deeper understanding of how information is stored in working memory.
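A sketch of the cross-temporal decoding analysis commonly used to distinguish static from dynamic population codes (simulated data and generic choices, not the authors' recordings or pipeline): a classifier trained on activity at one time bin is tested at every other time bin; a static code generalizes off the diagonal of the resulting matrix, whereas a dynamic code does not.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)
n_trials, n_bins, n_neurons = 100, 20, 50
X = rng.standard_normal((n_trials, n_bins, n_neurons))   # trials x time bins x neurons
y = rng.integers(0, 2, size=n_trials)                    # stand-in stimulus labels

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0)
generalization = np.zeros((n_bins, n_bins))
for t_train in range(n_bins):
    clf = LinearSVC(dual=False).fit(Xtr[:, t_train, :], ytr)
    for t_test in range(n_bins):
        generalization[t_train, t_test] = clf.score(Xte[:, t_test, :], yte)

print(generalization.round(2))   # rows: training bin, columns: testing bin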

}, url = {http://www.abstractsonline.com/pp8/$\#$!/4376/presentation/4782}, author = {Ethan Meyers and Mitchell Riley and Xue-Lian Qi and Christos Constantinidis} } @article {3465, title = {Differential Processing of Isolated Object and Multi-item Pop-Out Displays in LIP and PFC.}, journal = {Cerebral Cortex}, year = {2017}, month = {10/2017}, abstract = {

Objects that are highly distinct from their surroundings appear to visually "pop-out." This effect is present for displays in which: (1) a single cue object is shown on a blank background, and (2) a single cue object is highly distinct from surrounding objects; it is generally assumed that these 2 display types are processed in the same way. To directly examine this, we applied a decoding analysis to neural activity recorded from the lateral intraparietal (LIP) area and the dorsolateral prefrontal cortex (dlPFC). Our analyses showed that for the single-object displays, cue location information appeared earlier in LIP than in dlPFC. However, for the display with distractors, location information was substantially delayed in both brain regions, and information first appeared in dlPFC. Additionally, we see that the pattern of neural activity is similar for both types of displays and across different color transformations of the stimuli, indicating that location information is being coded in the same way regardless of display type. These results lead us to hypothesize that 2 different pathways are involved in processing these 2 types of pop-out displays.

}, keywords = {Attention, lateral intraparietal area, neural decoding, posterior parietal cortex, prefrontal cortex}, issn = {1047-3211}, doi = {10.1093/cercor/bhx243}, url = {https://academic.oup.com/cercor/advance-article/doi/10.1093/cercor/bhx243/4430784}, author = {Ethan Meyers and Andy Liang and Fumi Katsuki and Christos Constantinidis} } @article {2550, title = {Discriminate-and-Rectify Encoders: Learning from Image Transformation Sets}, year = {2017}, month = {03/2017}, abstract = {

The complexity of a learning task is increased by transformations in the input space that preserve class identity. Visual object recognition for example is affected by changes in viewpoint, scale, illumination or planar transformations. While drastically altering the visual appearance, these changes are orthogonal to recognition and should not be reflected in the representation or feature encoding used for learning. We introduce a framework for weakly supervised learning of image embeddings that are robust to transformations and selective to the class distribution, using sets of transforming examples (orbit sets), deep parametrizations and a novel orbit-based loss. The proposed loss combines a discriminative, contrastive part for orbits with a reconstruction error that learns to rectify orbit transformations. The learned embeddings are evaluated in distance metric-based tasks, such as one-shot classification under geometric transformations, as well as face verification and retrieval under more realistic visual variability. Our results suggest that orbit sets, suitably computed or observed, can be used for efficient, weakly-supervised learning of semantically relevant image embeddings.

}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos} } @article {2914, title = {Do Deep Neural Networks Suffer from Crowding?}, year = {2017}, month = {06/2017}, abstract = {

Crowding is a visual effect suffered by humans, in which an object that can be recognized in isolation can no longer be recognized when other objects, called flankers, are placed close to it. In this work, we study the effect of crowding in artificial Deep Neural Networks for object recognition. We analyze both standard deep convolutional neural networks (DCNNs) as well as a new version of DCNNs which is 1) multi-scale and 2) has convolution filters whose size changes depending on the eccentricity with respect to the center of fixation. Such networks, which we call eccentricity-dependent, are a computational model of the feedforward path of the primate visual cortex. Our results reveal that the eccentricity-dependent model, trained on target objects in isolation, can recognize such targets in the presence of flankers, if the targets are near the center of the image, whereas DCNNs cannot. Also, for all tested networks, when trained on targets in isolation, we find that recognition accuracy of the networks decreases the closer the flankers are to the target and the more flankers there are. We find that visual similarity between the target and flankers also plays a role and that pooling in early layers of the network leads to more crowding. Additionally, we show that incorporating the flankers into the images of the training set does not improve performance with crowding.
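The snippet below sketches the eccentricity-dependent idea in its simplest form (a toy illustration with arbitrary scale parameters, not the model used in the paper): the pooling window applied at each location grows with distance from the center of fixation, so central targets are kept at fine scale while peripheral flankers are coarsely pooled.

import numpy as np

def eccentricity_pool(image, base_size=2, growth=0.05):
    # Average-pool each pixel over a window whose radius grows with eccentricity.
    h, w = image.shape
    cy, cx = h // 2, w // 2
    out = np.empty_like(image, dtype=float)
    for y in range(h):
        for x in range(w):
            r = int(base_size + growth * np.hypot(y - cy, x - cx))   # radius grows with eccentricity
            ys, xs = slice(max(0, y - r), y + r + 1), slice(max(0, x - r), x + r + 1)
            out[y, x] = image[ys, xs].mean()
    return out

print(eccentricity_pool(np.random.rand(64, 64)).shape)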


}, author = {Anna Volokitin and Gemma Roig and Tomaso Poggio} } @article {2919, title = {Do Deep Neural Networks Suffer from Crowding? [code]}, year = {2017}, month = {06/2017}, abstract = {

This code accompanies the paper "Do Deep Neural Networks Suffer from Crowding?" by Anna Volokitin, Gemma Roig and Tomaso Poggio [1].

The main purpose of this repository is to provide an implementation of the eccentricity-dependent model [3], as well as to show an example of our experiments carried out in [1]. This code is inspired by the implementation described in [2]. Yet, it is not intended to replicate the results reported in [2].

The code is provided as is and is for academic purpose only.

Contact voanna AT vision.ee.ethz.ch and gemmar AT mit.edu for questions.

A GitHub repository for this code is available for downloading, cloning, etc.

}, author = {Anna Volokitin and Gemma Roig} } @article {2485, title = {Eccentricity Dependent Deep Neural Networks for Modeling Human Vision}, year = {2017}, author = {Gemma Roig and Francis Chen and X Boix and Tomaso Poggio} } @conference {2487, title = {Eccentricity Dependent Deep Neural Networks: Modeling Invariance in Human Vision}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, abstract = {

Humans can recognize objects in a way that is invariant to scale, translation, and clutter. We use invariance theory as a conceptual basis, to computationally model this phenomenon. This theory discusses the role of eccentricity in human visual processing, and is a generalization of feedforward convolutional neural networks (CNNs). Our model explains some key psychophysical observations relating to invariant perception, while maintaining important similarities with biological neural architectures. To our knowledge, this work is the first to unify explanations of all three types of invariance, all while leveraging the power and neurological grounding of CNNs.

}, url = {https://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15360}, author = {Francis Chen and Gemma Roig and Leyla Isik and X Boix and Tomaso Poggio} } @article {3393, title = {Effect of silhouetting and inversion on view invariance in the monkey inferotemporal cortex}, journal = {Journal of Neurophysiology}, volume = {11823}, year = {2017}, month = {Jan-07-2017}, pages = {353 - 362}, issn = {0022-3077}, doi = {10.1152/jn.00008.2017}, url = {http://www.physiology.org/doi/10.1152/jn.00008.2017http://www.physiology.org/doi/pdf/10.1152/jn.00008.2017}, author = {N. Apurva Ratan Murty and Arun, S. P.} } @article {3088, title = {Eye-Tracking Causality}, journal = {Psychological Science}, volume = {73}, year = {2017}, month = {10/2017}, abstract = {

How do people make causal judgments? What role, if any, does counterfactual simulation play? Counterfactual theories of causal judgments predict that people compare what actually happened with what would have happened if the candidate cause had been absent. Process theories predict that people focus only on what actually happened, to assess the mechanism linking candidate cause and outcome. We tracked participants{\textquoteright} eye movements while they judged whether one billiard ball caused another one to go through a gate or prevented it from going through. Both participants{\textquoteright} looking patterns and their judgments demonstrated that counterfactual simulation played a critical role. Participants simulated where the target ball would have gone if the candidate cause had been removed from the scene. The more certain participants were that the outcome would have been different, the stronger the causal judgments. These results provide the first direct evidence for spontaneous counterfactual simulation in an important domain of high-level cognition.

}, keywords = {causality, counterfactuals, eye tracking, intuitive physics, mental simulation, open data, open materials}, issn = {0956-7976}, doi = {10.1177/0956797617713053}, url = {http://journals.sagepub.com/doi/10.1177/0956797617713053}, author = {Tobias Gerstenberg and M.F. Peterson and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @article {3444, title = {Eye-Tracking Causality}, journal = {Psychological Science}, year = {2017}, abstract = {

How do people make causal judgments? What role, if any, does counterfactual simulation play? Counterfactual theories of causal judgments predict that people compare what actually happened with what would have happened if the candidate cause had been absent. Process theories predict that people focus only on what actually happened, to assess the mechanism linking candidate cause and outcome. We tracked participants{\textquoteright} eye movements while they judged whether one billiard ball caused another one to go through a gate or prevented it from going through. Both participants{\textquoteright} looking patterns and their judgments demonstrated that counterfactual simulation played a critical role. Participants simulated where the target ball would have gone if the candidate cause had been removed from the scene. The more certain participants were that the outcome would have been different, the stronger the causal judgments. These results provide the first direct evidence for spontaneous counterfactual simulation in an important domain of high-level cognition.

}, keywords = {causality, counterfactuals, eye tracking, intuitive physics, mental simulation}, author = {Tobias Gerstenberg and M.F. Peterson and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @article {3274, title = {A fast, invariant representation for human action in the visual system.}, journal = {J Neurophysiol}, year = {2017}, month = {11/2017}, pages = {jn.00642.2017}, abstract = {

Humans can effortlessly recognize others{\textquoteright} actions in the presence of complex transformations, such as changes in viewpoint. Several studies have located the regions in the brain involved in invariant action recognition; however, the underlying neural computations remain poorly understood. We use magnetoencephalography (MEG) decoding and a dataset of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by different actors at different viewpoints to study the computational steps used to recognize actions across complex transformations. In particular, we ask when the brain discriminates between different actions, and when it does so in a manner that is invariant to changes in 3D viewpoint. We measure the latency difference between invariant and non-invariant action decoding when subjects view full videos as well as form-depleted and motion-depleted stimuli. We were unable to detect a difference in decoding latency or temporal profile between invariant and non-invariant action recognition in full videos. However, when either form or motion information is removed from the stimulus set, we observe a decrease and delay in invariant action decoding. Our results suggest that the brain recognizes actions and builds invariance to complex transformations at the same time, and that both form and motion information are crucial for fast, invariant action recognition.

}, keywords = {action recognition, magnetoencephalography, neural decoding, vision}, issn = {1522-1598}, doi = {10.1152/jn.00642.2017}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @proceedings {2535, title = {Faulty Towers: A counterfactual simulation model of physical support}, year = {2017}, month = {07/2017}, abstract = {

In this paper we extend the counterfactual simulation model (CSM) {\textendash} originally developed to capture causal judgments about dynamic events (Gerstenberg, Goodman, Lagnado, \& Tenenbaum, 2014) {\textendash} to explain judgments of physical support. The CSM predicts that people judge physical support by mentally simulating what would happen if the object of interest were removed. Two experiments test the model by asking participants to evaluate the extent to which one brick in a tower is responsible for the rest of the bricks staying on the table. The results of both experiments show a very close correspondence between counterfactual simulations and responsibility judgments. We compare three versions of the CSM which differ in how they model people{\textquoteright}s uncertainty about what would have happened. Participants{\textquoteright} selections of which bricks would fall are best explained by assuming that counterfactual interventions only affect some aspects while leaving the rest of the scene unchanged.

}, keywords = {causality, counterfactual, intuitive physics, mental simulation, support}, author = {Tobias Gerstenberg and Liang Zhou and Kevin A Smith and Joshua B. Tenenbaum} } @article {3155, title = {Fisher-Rao Metric, Geometry, and Complexity of Neural Networks}, year = {2017}, month = {11/2017}, abstract = {

We study the relationship between geometry and capacity measures for deep neural networks from an invariance viewpoint. We introduce a new notion of capacity {\textemdash} the Fisher-Rao norm {\textemdash} that possesses desirable invariance properties and is motivated by Information Geometry. We discover an analytical characterization of the new capacity measure, through which we establish norm-comparison inequalities and further show that the new measure serves as an umbrella for several existing norm-based complexity measures. We discuss upper bounds on the generalization error induced by the proposed measure. Extensive numerical experiments on CIFAR-10 support our theoretical findings. Our theoretical analysis rests on a key structural lemma about partial derivatives of multi-layer rectifier networks.
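For readers wanting the central object in symbols, the following is a generic statement of the Fisher-Rao norm of a parameter vector $\theta$ for a model $p_{\theta}(y \mid x)$ (our notation, intended only as a sketch of the quantity the abstract names; see the paper for the precise definition and assumptions):

\[
  \|\theta\|_{\mathrm{fr}}^{2} \;=\; \theta^{\top} I(\theta)\,\theta,
  \qquad
  I(\theta) \;=\; \mathbb{E}_{x,\,y \sim p_{\theta}}\!\left[
    \nabla_{\theta} \log p_{\theta}(y \mid x)\,
    \nabla_{\theta} \log p_{\theta}(y \mid x)^{\top}
  \right],
\]

where $I(\theta)$ is the Fisher information matrix, the source of the norm's invariance properties.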

}, keywords = {capacity control, deep learning, Fisher-Rao metric, generalization error, information geometry, Invariance, natural gradient, ReLU activation, statistical learning theory}, url = {https://arxiv.org/abs/1711.01530}, author = {Liang, Tengyuan and Tomaso Poggio and Alexander Rakhlin and Stokes, James} } @article {3477, title = {Five-month-old infants attend to responsive caregivers}, year = {2017}, address = {Portland, OR}, abstract = {

Toddlers are sensitive to comforting interactions in animated events with geometric forms of different sizes that first move together, then separate, prompting the smaller form to emit a baby{\textquoteright}s cry (Johnson et al., 2007), and they expect adults who comfort the same crying baby to engage with one another (Spokes \& Spelke, 2017), but an earlier sensitivity to comforting interactions is unknown. Two OSF-preregistered experiments (N=32) asked if 5-month-old infants prefer an adult who comforts a crying baby over one who does not. In Experiment 1, infants viewed alternating events in which a baby cried, and two adults responded by approaching or fleeing the baby, moving the same distances in different directions. When the adults then appeared together without the baby for one 20-sec visual preference trial, infants looked longer to the responsive adult, M = 0.608, SD = 0.206, t(15) = 2.167, p = 0.047, Figure 1. Experiment 2 replaced the crying baby with a car emitting a siren noise, comparable in salience to a baby{\textquoteright}s cry. Infants looked as long at approach and avoid events as in Exp. 1 but showed no test preference for the adult who approached the car, M = 0.486, SD = 0.212, t(15) = 0.264, p = 0.795, Figure 1. Infants{\textquoteright} looking patterns in Exp. 1 thus cannot be explained by a general preference for an approaching over avoiding adult and suggest that infants attend to more responsive caregivers before they can approach or use language to communicate with their own social partners.

}, url = {https://cogdevsoc.org/wp-content/uploads/2017/10/CDS2017AbstractBook.pdf}, author = {A C Spokes and Tara Venkatesan and Elizabeth S Spelke} } @article {2908, title = {On the Forgetting of College Academics: at "Ebbinghaus Speed"?}, year = {2017}, month = {06/2017}, abstract = {

How important are Undergraduate College Academics after graduation? How much do we actually remember after we leave the college classroom, and for how long? Taking a look at major University ranking methodologies one can easily observe that they consistently lack any objective measure of what content knowledge and skills students retain from college education in the long term. Is there any rigorous scholarly published evidence on retention of long-term unused academic content knowledge? We have found no such evidence based on a preliminary literature review. Furthermore, findings in all research papers reviewed in this study were consistent with the following assertion: the Ebbinghaus forgetting curve [Ebbinghaus 1880-1885] is a fundamental law of human nature {\textendash} in fact, of the whole animal kingdom {\textendash} and applies to memory of all types: verbal, visual, abstract, social and autobiographical. This fundamental law of nature, when examined within the context of academic learning retention, manifests itself as an exponential curve halving memory saliency about every two years (what we call "Ebbinghaus Speed"). This paper presents the research group{\textquoteright}s initial hypothesis and conjectures for college level education programming and curriculum development, suggestions for instructional design enhancing learning durability, as well as future research directions.

}, author = {Brian Subirana and Aikaterini Bagiati and Sanjay Sarma} } @article {2716, title = {Formalizing emotion concepts within a Bayesian model of theory of mind}, journal = {Current Option in Psychology}, volume = {17}, year = {2017}, month = {10/2017}, pages = {15-21}, chapter = {15}, abstract = {

Sensitivity to others{\textquoteright} emotions is foundational for many aspects of human life, yet computational models do not currently approach the sensitivity and specificity of human emotion knowledge. Perception of isolated physical expressions largely supplies ambiguous, low-dimensional, and noisy information about others{\textquoteright} emotional states. By contrast, observers attribute specific granular emotions to another person based on inferences of how she interprets (or {\textquotedblleft}appraises{\textquotedblright}) external events in relation to her other mental states (goals, beliefs, moral values, costs). These attributions share neural mechanisms with other reasoning about minds. Situating emotion concepts in a formal model of people{\textquoteright}s intuitive theories about other minds is necessary to effectively capture humans{\textquoteright} fine-grained emotion understanding.

}, keywords = {appraisal, bayes, emotion, inference, perception}, doi = {https://doi.org/10.1016/j.copsyc.2017.04.019}, url = {http://www.sciencedirect.com/science/article/pii/S2352250X17300283}, author = {Rebecca Saxe and Sean Dae Houlihan} } @proceedings {3478, title = {Four-year-old children favor kin when the stakes are higher}, year = {2017}, address = {Portland, OR}, abstract = {

Only in cases when the stakes are high--donating a kidney or risking injury to rescue someone in peril--do adults report more willingness to help siblings over close friends (Stewart-Williams, 2007). When people are dividing plentiful, low-value resources, children expect them to share equally with friends and siblings (Olson \& Spelke, 2008). However, will children show a kinship preference when the stakes are higher? We first tested young children{\textquoteright}s relative favoring of kin versus friends and strangers in distributing limited resources--one item instead of many (Spokes \& Spelke, 2016). We found that 3- to 5-year-old children (n=252) shared more with kin and friends than with strangers but did not favor kin over friends, either when reasoning about fictional characters (Experiments 1, 3) or about their own friends and family (Experiment 2). This pattern of results could have occurred for two reasons: first, young children do not yet have the kinship index mechanisms that guide adults{\textquoteright} recent altruistic favors and reported likelihood of donating an organ to siblings (Lieberman, Tooby \& Cosmides, 2007). Second, the hypothetical costs and rewards used may not be relevant or valuable to children. To distinguish between these hypotheses, we asked whether children would show a preference for kin if the cost was more relevant to them--their own time and effort. In the present experiment, we asked if children would work harder for kin over non-kin when playing a challenging geometry game (Dillon, Huang, \& Spelke, 2013). Each round, they could earn stickers for a different recipient: themselves, a parent, sibling, friend, or an unfamiliar child. Children could end the round whenever they wanted. We measured the number of trials played, trials answered correctly, and duration of play. Data for the number of trials and duration played were log-normally distributed, so we log transformed these variables prior to analyses (Csibra, Hernik, Mascaro, Tatone, \& Lengyel, 2016). Across these measures, one-way ANOVAs revealed that four-year-olds (n=24) played more trials for their kin relations--siblings and parents--than for non-kin--friends and strangers, F(1, 46) = 4.27, p = .044, answered more trials correctly, F(1, 46) = 4.57, p = .038, and played marginally longer, F(1, 46) = 3.14, p = .083. There was no main effect of recipient when comparing across all four recipients, nor were there significant pairwise comparisons. Five-year-olds (n=24) did not differ when playing for kin versus non-kin (ps \> .05). These findings provide initial evidence that four-year-old children calibrate their time and effort in a task differently according to who will reap the rewards, but five-year-olds do not. Five-year-olds may find the task easier and less costly or may have different social experiences having attended school. Nonetheless, we found that children{\textquoteright}s social decisions depend upon the recipient of their generosity. We provide initial evidence that children may favor kin when the stakes are higher and resources--their time and effort--are more meaningful to them: four-year-olds played more trials and did so more accurately when winning for kin.

}, url = {https://cogdevsoc.org/wp-content/uploads/2017/10/CDS2017AbstractBook.pdf}, author = {A C Spokes and Elizabeth S Spelke} } @article {2614, title = {From agents to actions to interactions: Uncovering multiple social networks in the primate brain}, year = {2017}, author = {J. Sliwa and W. A. Freiwald} } @article {2408, title = {Full interpretation of minimal images}, year = {2017}, month = {02/2017}, abstract = {

The goal in this work is to model the process of {\textquoteleft}full interpretation{\textquoteright} of object images, which is the ability to identify and localize all semantic features and parts that are recognized by human observers. The task is approached by dividing the interpretation of the complete object to the interpretation of multiple reduced but interpretable local regions. In such reduced regions, interpretation is simpler, since the number of semantic components is small, and the variability of possible configurations is low.

We model the interpretation process by identifying primitive components and relations that play a useful role in local interpretation by humans. To identify useful components and relations used in the interpretation process, we consider the interpretation of {\textquoteleft}minimal configurations{\textquoteright}: these are reduced local regions, which are minimal in the sense that further reduction renders them unrecognizable and uninterpretable. We show that such minimal interpretable images have useful properties, which we use to identify informative features and relations used for full interpretation. We describe our interpretation model, and show results of detailed interpretations of minimal configurations, produced automatically by the model. Finally, we discuss implications of full interpretation to difficult visual tasks, such as recognizing human activities or interactions, which are beyond the scope of current models of visual recognition.

This manuscript has been accepted for publication in Cognition.

}, keywords = {Image interpretation, Parts and relations, Visual object recognition}, author = {Guy Ben-Yosef and Liav Assif and Shimon Ullman} } @conference {3575, title = {Generative modeling of audible shapes for object perception}, booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

Humans infer rich knowledge of objects from both auditory and visual cues. Building a machine of such competency, however, is very challenging, due to the great difficulty in capturing large-scale, clean data of objects with both their appearance and the sound they make. In this paper, we present a novel, open-source pipeline that generates audio-visual data, purely from 3D object shapes and their physical properties. Through comparison with audio recordings and human behavioral studies, we validate the accuracy of the sounds it generates. Using this generative model, we are able to construct a synthetic audio-visual dataset, namely Sound-20K, for object perception tasks. We demonstrate that auditory and visual information play complementary roles in object perception, and further, that the representation learned on synthetic audio-visual data can transfer to real-world scenarios.

}, url = {http://openaccess.thecvf.com/content_iccv_2017/html/Zhang_Generative_Modeling_of_ICCV_2017_paper.html}, author = {Zhoutong Zhang and Jiajun Wu and Qiujia Li and Zhengjia Huang and James Traer and Josh H. McDermott and Joshua B. Tenenbaum and William T. Freeman} } @conference {3639, title = {Human Learning in Atari}, booktitle = {AAAI Spring Symposium Series}, year = {2017}, abstract = {

Atari games are an excellent testbed for studying intelligent behavior, as they offer a range of tasks that differ widely in their visual representation, game dynamics, and goals presented to an agent. The last two years have seen a spate of research into artificial agents that use a single algorithm to learn to play these games. The best of these artificial agents perform at better-than-human levels on most games, but require hundreds of hours of game-play experience to produce such behavior. Humans, on the other hand, can learn to perform well on these tasks in a matter of minutes. In this paper we present data on human learning trajectories for several Atari games, and test several hypotheses about the mechanisms that lead to such rapid learning.\ 

}, author = {Pedro Tsividis and Thomas Pouncy and Jacqueline L. Xu and Joshua B. Tenenbaum and Samuel J Gershman} } @article {2484, title = {On the Human Visual System Invariance to Translation and Scale}, year = {2017}, author = {Yena Han and Gemma Roig and Gadi Geiger and Tomaso Poggio} } @conference {2486, title = {Is the Human Visual System Invariant to Translation and Scale?}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, author = {Yena Han and Gemma Roig and Gadi Geiger and Tomaso Poggio} } @article {3063, title = {Infants make more attempts to achieve a goal when they see adults persist}, journal = {Science}, volume = {357}, year = {2017}, month = {Oct-09-2018}, pages = {1290 - 1294}, abstract = {

Persistence, above and beyond IQ, is associated with long-term academic outcomes. To look at the effect of adult models on infants{\textquoteright} persistence, we conducted an experiment in which 15-month-olds were assigned to one of three conditions: an Effort condition in which they saw an adult try repeatedly, using various methods, to achieve each of two different goals; a No Effort condition in which the adult achieved the goals effortlessly; or a Baseline condition. Infants were then given a difficult, novel task. Across an initial study and two preregistered experiments (N = 262), infants in the Effort condition made more attempts to achieve the goal than did infants in the other conditions. Pedagogical cues modulated the effect. The results suggest that adult models causally affect infants{\textquoteright} persistence and that infants can generalize the value of persistence to novel tasks.

}, issn = {0036-8075}, doi = {10.1126/science.aan2317}, url = {http://www.sciencemag.org/lookup/doi/10.1126/science.aan2317}, author = {Leonard, Julia A. and Lee, Yuna and Laura Schulz} } @article {2733, title = {Inferring Beliefs and Desires From Emotional Reactions to Anticipated and Observed Events}, journal = {Child Development}, year = {2017}, month = {03/2017}, abstract = {

Researchers have long been interested in the relation between emotion understanding and theory of mind. This study investigates a cue to mental states that has rarely been investigated: the dynamics of valenced emotional expressions. When the valence of a character{\textquoteright}s facial expression was stable between an expected and observed outcome, children (N\ =\ 122; M\ =\ 5.0\ years) recovered the character{\textquoteright}s desires but did not consistently recover her beliefs. When the valence changed, older but not younger children recovered both the characters{\textquoteright} beliefs and desires. In contrast, adults jointly recovered agents{\textquoteright} beliefs and desires in all conditions. These results suggest that the ability to infer mental states from the dynamics of emotional expressions develops gradually through early and middle childhood.

}, doi = {10.1111/cdev.12759}, url = {http://onlinelibrary.wiley.com/doi/10.1111/cdev.12759/abstract}, author = {Yang Wu and Laura Schulz} } @article {3162, title = {Invariant action recognition dataset}, year = {2017}, month = {11/2017}, abstract = {

To study the effect of changes in view and actor on action recognition, we filmed a dataset of five actors performing five different actions (drink, eat, jump, run and walk) on a treadmill from five different views (0, 45, 90, 135, and 180 degrees from the front of the actor/treadmill; the treadmill rather than the camera was rotated in place to acquire from different viewpoints). The dataset was filmed on a fixed, constant background. To avoid low-level object/action confounds (e.g. the action {\textquotedblleft}drink{\textquotedblright} being classified as the only videos with water bottle in the scene) and guarantee that the main sources of variation of visual appearance are due to actions, actors and viewpoint, the actors held the same objects (an apple and a water bottle) in each video, regardless of the action they performed. This controlled design allows us to test hypotheses on the computational mechanisms underlying invariant recognition in the human visual system without having to settle for a synthetic dataset.

More information and the dataset files can be found here - https://doi.org/10.7910/DVN/DMT0PG

}, url = {https://doi.org/10.7910/DVN/DMT0PG}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {3453, title = {Invariant recognition drives neural representations of action sequences}, journal = {PLoS Comp. Bio}, year = {2017}, abstract = {

Recognizing the actions of others from visual stimuli is a crucial aspect of human perception that allows individuals to respond to social cues. Humans are able to discriminate between similar actions despite transformations, like changes in viewpoint or actor, that substantially alter the visual appearance of a scene. This ability to generalize across complex transformations is a hallmark of human visual intelligence. Advances in understanding action recognition at the neural level have not always translated into precise accounts of the computational principles underlying what representations of action sequences are constructed by human visual cortex. Here we test the hypothesis that invariant action discrimination might fill this gap. Recently, the study of artificial systems for static object perception has produced models, Convolutional Neural Networks (CNNs), that achieve human level performance in complex discriminative tasks. Within this class, architectures that better support invariant object recognition also produce image representations that better match those implied by human and primate neural data. However, whether these models produce representations of action sequences that support recognition across complex transformations and closely follow neural representations of actions remains unknown. Here we show that spatiotemporal CNNs accurately categorize video stimuli into action classes, and that deliberate model modifications that improve performance on an invariant action recognition task lead to data representations that better match human neural recordings. Our results support our hypothesis that performance on invariant discrimination dictates the neural representations of actions computed in the brain. These results broaden the scope of the invariant recognition framework for understanding visual intelligence from perception of inanimate objects and faces in static images to the study of human perception of action sequences.

Associated Dataset: MEG action recognition data

}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {3272, title = {Invariant recognition drives neural representations of action sequences}, journal = {PLOS Computational Biology}, volume = {13}, year = {2017}, month = {12/2017}, pages = {e1005859}, abstract = {

Recognizing the actions of others from visual stimuli is a crucial aspect of human perception that allows individuals to respond to social cues. Humans are able to discriminate between similar actions despite transformations, like changes in viewpoint or actor, that substantially alter the visual appearance of a scene. This ability to generalize across complex transformations is a hallmark of human visual intelligence. Advances in understanding action recognition at the neural level have not always translated into precise accounts of the computational principles underlying what representations of action sequences are constructed by human visual cortex. Here we test the hypothesis that invariant action discrimination might fill this gap. Recently, the study of artificial systems for static object perception has produced models, Convolutional Neural Networks (CNNs), that achieve human level performance in complex discriminative tasks. Within this class, architectures that better support invariant object recognition also produce image representations that better match those implied by human and primate neural data. However, whether these models produce representations of action sequences that support recognition across complex transformations and closely follow neural representations of actions remains unknown. Here we show that spatiotemporal CNNs accurately categorize video stimuli into action classes, and that deliberate model modifications that improve performance on an invariant action recognition task lead to data representations that better match human neural recordings. Our results support our hypothesis that performance on invariant discrimination dictates the neural representations of actions computed in the brain. These results broaden the scope of the invariant recognition framework for understanding visual intelligence from perception of inanimate objects and faces in static images to the study of human perception of action sequences.
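As a concrete illustration of the model-to-brain comparison described above, the sketch below computes representational dissimilarity matrices (RDMs) for a set of video embeddings and for neural response patterns to the same clips, then correlates them. It is a minimal sketch only: the random feature and response matrices are placeholders for the spatiotemporal CNN activations and the MEG recordings, and the clip count and dimensionalities are invented.

```python
import numpy as np
from scipy.spatial.distance import pdist
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
n_videos = 25                                  # e.g. 5 actions x 5 viewpoints

# Placeholders: model features and neural response patterns for the same video clips.
model_features = rng.normal(size=(n_videos, 512))
neural_patterns = model_features @ rng.normal(size=(512, 100)) \
    + rng.normal(0.0, 5.0, size=(n_videos, 100))

# Representational dissimilarity matrices (condensed upper triangles).
model_rdm = pdist(model_features, metric="correlation")
neural_rdm = pdist(neural_patterns, metric="correlation")

rho, p = spearmanr(model_rdm, neural_rdm)
print(f"model-to-brain representational similarity: rho = {rho:.2f} (p = {p:.2g})")
```

On this kind of analysis, a model modification that improves invariant action recognition would be expected to raise the model-to-brain correlation.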

}, doi = {10.1371/journal.pcbi.1005859}, url = {http://dx.plos.org/10.1371/journal.pcbi.1005859}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio}, editor = {Berniker, Max} } @inbook {2562, title = {Invariant Recognition Predicts Tuning of Neurons in Sensory Cortex}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, pages = {85-104}, publisher = {Springer}, organization = {Springer}, issn = {978-981-10-0211-3}, author = {Jim Mutch and F. Anselmi and Andrea Tacchetti and Lorenzo Rosasco and JZ. Leibo and Tomaso Poggio} } @article {2751, title = {Investigating audition with a generative model of impact sounds}, year = {2017}, author = {James Traer and Josh H. McDermott} } @article {2785, title = {Learning a commonsense moral theory}, year = {2017} } @article {2386, title = {Learning Mid-Level Auditory Codes from Natural Sound Statistics}, year = {2017}, month = {01/2017}, abstract = {

Interaction with the world requires an organism to transform sensory signals into representations in which behaviorally meaningful properties of the environment are made explicit. These representations are derived through cascades of neuronal processing stages in which neurons at each stage recode the output of preceding stages. Explanations of sensory coding may thus involve understanding how low-level patterns are combined into more complex structures. Although models exist in the visual domain to explain how mid-level features such as junctions and curves might be derived from oriented filters in early visual cortex, little is known about analogous grouping principles for mid-level auditory representations. We propose a hierarchical generative model of natural sounds that learns combinations of spectrotemporal features from natural stimulus statistics. In the first layer the model forms a sparse convolutional code of spectrograms using a dictionary of learned spectrotemporal kernels. To generalize from specific kernel activation patterns, the second layer encodes patterns of time-varying magnitude of multiple first layer coefficients. Because second-layer features are sensitive to combinations of spectrotemporal features, the representation they support encodes more complex acoustic patterns than the first layer. When trained on corpora of speech and environmental sounds, some second-layer units learned to group spectrotemporal features that occur together in natural sounds. Others instantiate opponency between dissimilar sets of spectrotemporal features. Such groupings might be instantiated by neurons in the auditory cortex, providing a hypothesis for mid-level neuronal computation.

}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2666, title = {Learning Mid-Level Codes for Natural Sounds}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @proceedings {3240, title = {Learning to See Physics via Visual De-animation}, year = {2017}, month = {12/2017}, pages = {152{\textendash}163}, abstract = {
We introduce a paradigm for understanding physical scenes without human annotations. At the core of our system is a physical world representation that is first recovered by a perception module and then utilized by physics and graphics engines. During training, the perception module and the generative models learn by visual de-animation {\textemdash} interpreting and reconstructing the visual information stream. During testing, the system first recovers the physical world state, and then uses the generative models for reasoning and future prediction. Even more so than forward simulation, inverting a physics or graphics engine is a computationally hard problem; we overcome this challenge by using a convolutional inversion network. Our system quickly recognizes the physical world state from appearance and motion cues, and has the flexibility to incorporate both differentiable and non-differentiable physics and graphics engines. We evaluate our system on both synthetic and real datasets involving multiple physical scenes, and demonstrate that our system performs well on both physical state estimation and reasoning problems. We further show that the knowledge learned on the synthetic dataset generalizes to constrained real images.
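The recover-then-simulate loop described above can be illustrated with a deliberately tiny example. The sketch below is not the paper's system: the one-dimensional "world", the argmax perception module, and the constant-velocity physics engine are all stand-ins chosen only to make the structure of visual de-animation explicit.

```python
import numpy as np

def render(position, size=32):
    """Toy graphics engine: draw a one-pixel object on a 1D canvas."""
    frame = np.zeros(size)
    frame[int(position) % size] = 1.0
    return frame

def perceive(frame):
    """Toy perception module: recover the object's position from pixels."""
    return int(np.argmax(frame))

def physics_step(position, velocity):
    """Toy physics engine: constant-velocity forward simulation."""
    return position + velocity

# Observed stream: two frames of an object moving to the right.
frames = [render(3), render(5)]

# De-animation: recover the physical state (position, velocity) behind the pixels...
p0, p1 = perceive(frames[0]), perceive(frames[1])
velocity = p1 - p0

# ...then reuse the physics and graphics engines for prediction.
p2 = physics_step(p1, velocity)
predicted_frame = render(p2)
print("recovered state:", (p1, velocity), "-> predicted next position:", p2)
```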
}, url = {http://papers.nips.cc/paper/6620-learning-to-see-physics-via-visual-de-animation.pdf}, author = {Jiajun Wu and Lu, Erika and Kohli, Pushmeet and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @article {2750, title = {A library of real-world reverberation and a toolbox for its analysis and measurement}, year = {2017}, author = {James Traer and Josh H. McDermott} } @article {2609, title = {Like adults, children make consistent welfare tradeoff allocations}, year = {2017}, address = {Austin, Texas}, author = {A C Spokes and Howard, R and S A Mehr and Krasnow, M M} } @conference {2607, title = {Like Adults, children make consistent welfare tradeoff allocations}, booktitle = {Budapest CEU Conference on Cognitive Development}, year = {2017}, address = {Budapest, Hungary}, author = {A C Spokes and Howard, R and S A Mehr and Krasnow, M M} } @article {2810, title = {Local field potentials primarily reflect inhibitory neuron activity in human and monkey cortex}, journal = {Nature Scientific Reports}, year = {2017}, abstract = {

The local field potential (LFP) is generated by large populations of neurons, but the unitary contribution of spiking neurons to the LFP is not well characterized. We investigated this contribution in multi-electrode array recordings from the cerebral cortex of human and monkey by calculating the spike-triggered LFP average (st-LFP). The resulting st-LFPs were dominated by broad spatio-temporal components due to on-going activity, synaptic inputs and recurrent connectivity. To observe the local field of a single spike we applied spatial filtering. The filtered st-LFPs were limited to an area of 800 μm from the neuron, and propagated at axonal speed, which is consistent with their unitary nature. In addition, we discriminated between putative inhibitory and excitatory neurons and found that the former dominated this unitary LFP contribution, consistent with previous findings in hippocampal slices. Thus, in human and monkey cortex, the LFP may primarily reflect inhibitory neuron activity.

}, doi = {10.1038/srep40211}, url = {http://www.nature.com/articles/srep40211}, author = {Bartosz Telenczuk and Nima Dehghani and Michel Le Van Quyen and Sydney Cash and Eric Halgren and Nicholas Hatsopoulos and Alain Destexhe} } @article {2499, title = {Lookit (Part 1): a new online platform for developmental research}, journal = {Open Mind}, volume = {1}, year = {2017}, month = {03/2017}, author = {Scott, K M and Laura Schulz} } @article {2500, title = {Lookit (Part 2): Assessing the viability of online developmental research, Results from three case studies}, journal = {Open Mind}, volume = {1}, year = {2017}, month = {03/2017}, author = {Scott, K M and Chu, J and Laura Schulz} } @article {2667, title = {Lossy Compression of Uninformative Stimuli in the Auditory System}, year = {2017}, author = {Wiktor Mlynarski and Josh H. McDermott} } @proceedings {2536, title = {Marbles in inaction: Counterfactual simulation and causation by omission}, year = {2017}, month = {07/2017}, abstract = {

Consider the following causal explanation: The ball went through the goal because the defender didn{\textquoteright}t block it. There are at least two problems with citing omissions as causal explanations. First, how do we choose the relevant candidate omission (e.g. why the defender and not the goalkeeper). Second, how do we determine what would have happened in the relevant counterfactual situation (i.e. maybe the shot would still have gone through the goal even if it had been blocked). In this paper, we extend the counterfactual simulation model (CSM) of causal judgment (Gerstenberg, Goodman, Lagnado, \& Tenenbaum, 2014) to handle the second problem. In two experiments, we show how people{\textquoteright}s causal model of the situation affects their causal judgments via influencing what counterfactuals they consider. Omissions are considered causes to the extent that the outcome in the relevant counterfactual situation would have been different from what it actually was.
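A minimal sketch of the counterfactual-simulation idea for omissions follows: the defender's failure to block is judged causal to the degree that noisy simulations of the counterfactual world (in which the defender does block) produce a different outcome. The probabilities and noise model below are invented for illustration and simplify the CSM's conditioning on the actually observed world.

```python
import numpy as np

rng = np.random.default_rng(0)

def goal_scored(blocked, noise_sd=0.3):
    """One noisy simulation of the shot; blocking usually, but not always, stops it."""
    shot_quality = 0.8 + rng.normal(0.0, noise_sd)
    block_effect = 0.9 if blocked else 0.0
    return shot_quality - block_effect > 0.0

def omission_causal_strength(n_sim=10_000):
    """P(outcome would have differed had the defender blocked), given the goal was scored."""
    actual = np.array([goal_scored(blocked=False) for _ in range(n_sim)])
    counterfactual = np.array([goal_scored(blocked=True) for _ in range(n_sim)])
    scored = actual                      # condition on the actual outcome: the ball went in
    return float(np.mean(actual[scored] != counterfactual[scored]))

print(f"causal strength of the defender's omission: {omission_causal_strength():.2f}")
```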

}, author = {Simon Stephan and Pascale Willemsen and Tobias Gerstenberg} } @proceedings {2793, title = {Markov transitions between attractor states in a recurrent neural network}, year = {2017}, abstract = {

Stochasticity is an essential part of explaining the world. Increasingly, neuroscientists and cognitive scientists are identifying mechanisms whereby the brain uses probabilistic reasoning in representational, predictive, and generative settings. But stochasticity is not always useful: robust perception and memory retrieval require representations that are immune to corruption by stochastic noise. In an effort to combine these robust representations with stochastic computation, we present an architecture that generalizes traditional recurrent attractor networks to follow probabilistic Markov dynamics between stable and noise-resistant fixed points.
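One way to picture the proposed architecture is the toy below: stored patterns in a Hopfield-style network act as noise-resistant fixed points, and stochastic "kicks" move the state between them with probabilities drawn from a target Markov transition matrix. This is a loose illustrative reduction, not the mechanism of the paper; the network size, corruption level, and transition matrix are arbitrary.

```python
import numpy as np

rng = np.random.default_rng(1)
n_units, n_patterns = 200, 3

# Hebbian storage: each pattern becomes a noise-resistant fixed point.
patterns = rng.choice([-1.0, 1.0], size=(n_patterns, n_units))
W = (patterns.T @ patterns) / n_units
np.fill_diagonal(W, 0.0)

# Target Markov chain over the attractor states (rows sum to 1).
T = np.array([[0.8, 0.1, 0.1],
              [0.1, 0.8, 0.1],
              [0.1, 0.1, 0.8]])

def settle(state, n_steps=20):
    """Deterministic attractor dynamics: denoise toward the nearest stored pattern."""
    for _ in range(n_steps):
        state = np.sign(W @ state)
        state[state == 0] = 1.0
    return state

def nearest_pattern(state):
    return int(np.argmax(patterns @ state))

state, visits = patterns[0].copy(), []
for _ in range(500):
    k = nearest_pattern(state)
    visits.append(k)
    # Stochastic kick: corrupt the state toward a pattern sampled from the chain,
    # then let the attractor dynamics clean up the corrupted state.
    target = rng.choice(n_patterns, p=T[k])
    noisy = np.where(rng.random(n_units) < 0.2, -patterns[target], patterns[target])
    state = settle(noisy)

occupancy = np.bincount(visits, minlength=n_patterns) / len(visits)
print("empirical occupancy of the three attractors:", np.round(occupancy, 2))
```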

}, author = {Ishita Dasgupta and Jeremy Bernstein and David Rolnick and Haim Sompolinsky} } @proceedings {3241, title = {MarrNet: 3D Shape Reconstruction via 2.5D Sketches}, year = {2017}, month = {12/2017}, pages = {540{\textendash}550}, publisher = {Curran Associates, Inc.}, address = {Long Beach, CA}, abstract = {

3D object reconstruction from a single image is a highly under-determined problem, requiring strong prior knowledge of plausible 3D shapes. This introduces challenge for learning-based approaches, as 3D object annotations in real images are scarce. Previous work chose to train on synthetic data with ground truth 3D information, but suffered from the domain adaptation issue when tested on real data. In this work, we propose an end-to-end trainable framework, sequentially estimating 2.5D sketches and 3D object shapes. Our disentangled, two-step formulation has three advantages. First, compared to full 3D shape, 2.5D sketches are much easier to be recovered from a 2D image, and to transfer from synthetic to real data. Second, for 3D reconstruction from the 2.5D sketches, we can easily transfer the learned model on synthetic data to real images, as rendered 2.5D sketches are invariant to object appearance variations in real images, including lighting, texture, etc. This further relieves the domain adaptation problem. Third, we derive differentiable projective functions from 3D shape to 2.5D sketches, making the framework end-to-end trainable on real images, requiring no real-image annotations. Our framework achieves state-of-the-art performance on 3D shape reconstruction.

}, url = {http://papers.nips.cc/paper/6657-marrnet-3d-shape-reconstruction-via-25d-sketches.pdf}, author = {Jiajun Wu and Wang, Yifan and Xue, Tianfan and Sun, Xingyuan and William T. Freeman and Joshua B. Tenenbaum}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @article {3512, title = {Mind Games: Game Engines as an Architecture for Intuitive Physics}, journal = {Trends in Cognitive Science}, volume = {21}, year = {2017}, month = {09/2017}, pages = {649 - 665}, chapter = {649}, issn = {1364-6613}, doi = {10.1016/j.tics.2017.05.012}, url = {https://www.cell.com/trends/cognitive-sciences/fulltext/S1364-6613(17)30113-4}, author = {Ullman, Tomer D. and Elizabeth S Spelke and Battaglia, Peter and Joshua B. Tenenbaum} } @conference {2724, title = {A model for interpreting social interactions in local image regions}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, month = {03/2017}, address = {Palo Alto, CA}, abstract = {
Understanding social interactions (such as {\textquoteleft}hug{\textquoteright} or {\textquoteleft}fight{\textquoteright}) is a basic and important capacity of the human visual system, but a challenging and still open problem for modeling. In this work we study visual recognition of social interactions, based on small but recognizable local regions. The approach is based on two novel key components: (i) A given social interaction can be recognized reliably from reduced images (called {\textquoteleft}minimal images{\textquoteright}). (ii) The recognition of a social interaction depends on identifying components and relations within the minimal image (termed {\textquoteleft}interpretation{\textquoteright}). We show psychophysics data for minimal images and modeling results for their interpretation. We discuss the integration of minimal configurations in recognizing social interactions in a detailed, high-resolution image.
}, url = {http://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15354}, author = {Guy Ben-Yosef and Alon Yachin and Shimon Ullman} } @article {2799, title = {Modeling brain dynamics using mathematics from quantum mechanics}, volume = {Boston University}, year = {2017}, author = {David Theurel} } @article {2721, title = {Modeling emotion attributions as inference in an intuitive theory of mind.}, year = {2017}, address = {University of Wisconsin - Madison}, abstract = {

We model how people make third party emotion attributions as integration of perceptual cues and conceptual event knowledge in an intuitive causal theory of mind. Novel stimuli generated from a televised gameshow provide authentic (not staged) dynamic displays of emotion in the context of a quantifiable and repeatable game (a one-shot prisoner{\textquoteright}s dilemma). The gameshow involves public acts of cooperation, commitment, and betrayal, with stakes spanning five orders of magnitude (max ≈ $200,000), and thus supports a wide range of inferred emotions. The raw footage is separated into expression cues and contextual descriptions such that each player{\textquoteright}s emotions can be inferred from the player{\textquoteright}s reactions to the outcome (i.e. facial expressions and body postures), or from the event context (i.e. stakes, actions, and outcomes), or from both information sources together. Study participants attribute the experience of 20 nuanced emotions to the contestants based on (i) only dynamic visual emotion cues, (ii) only event descriptions, and (iii) combined dynamic visual cues and event descriptions. Principal component analysis and hierarchical clustering of emotion ratings are used to assess the dimensionality and structure of the attribution space supported by each unimodal signal as well as by the multimodal signal. The attributions are modeled using general linear regression with a priori features derived from behavioral economics and experimental psychology, including prospect theory, expected utility, and loss aversion. Finally, a Bayesian generative cue-combination model tests how effective joint conditioning on the unimodal signals is in explaining the emotion inferences participants make when given multimodal information.

}, keywords = {attribution, bayes, emotion, inference, inverse, perception}, author = {Sean Dae Houlihan and Rebecca Saxe} } @article {2737, title = {Moral alchemy: How love changes norms}, journal = {Cognition}, volume = {167}, year = {2017}, month = {10/2017}, pages = {135 -150}, chapter = {135}, abstract = {

We discuss a process by which non-moral concerns (that is, concerns agreed to be non-moral within a particular cultural context) can take on moral content. We refer to this phenomenon as moral alchemy and suggest that it arises because moral obligations of care entail recursively valuing loved ones{\textquoteright} values, thus allowing propositions with no moral weight in themselves to become morally charged. Within this framework, we predict that when people believe a loved one cares about a behavior more than they do themselves, the moral imperative to care about the loved one{\textquoteright}s interests will raise the value of that behavior, such that people will be more likely to infer that third parties will see the behavior as wrong (Experiment 1) and the behavior itself as more morally important (Experiment 2) than when the same behaviors are considered outside the context of a caring relationship. The current study confirmed these predictions.

}, keywords = {Ethics of care, Moral learning, Recursive value, Utility}, doi = {https://doi.org/10.1016/j.cognition.2017.03.003}, url = {https://www.sciencedirect.com/science/article/pii/S0010027717300689}, author = {Rachel Magid and Laura Schulz} } @article {3633, title = {Multi-stage Multi-recursive-input Fully Convolutional Networks for Neuronal Boundary Detection}, year = {2017}, month = {10/2017}, abstract = {

In the field of connectomics, neuroscientists seek to identify cortical connectivity comprehensively. Neuronal boundary detection from Electron Microscopy (EM) images is often performed to assist the automatic reconstruction of neuronal circuits. But the segmentation of EM images is a challenging problem, as it requires the detector to capture both filament-like thin and blob-like thick membranes while suppressing ambiguous intracellular structure. In this paper, we propose multi-stage multi-recursive-input fully convolutional networks to address this problem. The multiple recursive inputs to one stage, i.e., the multiple side outputs with different receptive field sizes learned from the lower stage, provide multi-scale contextual boundary information for the subsequent learning. This design is biologically plausible, as, much like the human visual system, it compares different possible segmentation solutions to resolve ambiguous boundaries. Our multi-stage networks are trained end-to-end and achieve promising results on two publicly available EM segmentation datasets, the mouse piriform cortex dataset and the ISBI 2012 EM dataset.

}, author = {Wei Shen and Bin Wang and Yuan Jiang and Yan Wang and Alan Yuille} } @article {2780, title = {Musings on Deep Learning: Properties of SGD}, year = {2017}, month = {04/2017}, abstract = {

[formerly titled "Theory of Deep Learning III: Generalization Properties of SGD"]

In Theory III we characterize with a mix of theory and experiments the generalization properties of Stochastic Gradient Descent in overparametrized deep convolutional networks. We show that Stochastic Gradient Descent (SGD) selects with high probability solutions that 1) have zero (or small) empirical error, 2) are degenerate as shown in Theory II and 3) have maximum generalization.
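A toy illustration of the first claim (zero or small empirical error in the overparametrized regime), not a reproduction of the paper's experiments: SGD on an overparametrized linear least-squares problem interpolates the training data and, when started from zero, stays in the row space of the data and therefore ends up near the minimum-norm interpolant. All sizes and learning rates below are arbitrary.

```python
import numpy as np

rng = np.random.default_rng(0)
n_samples, n_features = 20, 100            # overparametrized: more parameters than data
X = rng.normal(size=(n_samples, n_features))
y = rng.normal(size=n_samples)

w = np.zeros(n_features)                   # start at zero
lr = 0.01
for _ in range(20_000):
    i = rng.integers(n_samples)            # one example per step: stochastic gradient
    w -= lr * (X[i] @ w - y[i]) * X[i]

w_min_norm = np.linalg.pinv(X) @ y         # minimum-norm interpolant, for comparison
print(f"training MSE after SGD: {np.mean((X @ w - y) ** 2):.2e}")
print(f"||w_sgd|| = {np.linalg.norm(w):.3f}  vs  ||w_min_norm|| = {np.linalg.norm(w_min_norm):.3f}")
```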

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Karthik Sridharan and Brando Miranda and Noah Golowich and Tomaso Poggio} } @article {3538, title = {A Network for Social interaction understanding in the primate brain}, year = {2017}, address = {Vancouver, Canada}, author = {Julia Sliwa and W. A. Freiwald} } @article {4066, title = {Neuronal population coding of perceived and memorized visual features in the lateral prefrontal cortex}, journal = {Nature Communications}, volume = {8}, year = {2017}, month = {June 2017}, type = {Article No. 15471}, abstract = {

The primate lateral prefrontal cortex (LPFC) encodes visual stimulus features while they are perceived and while they are maintained in working memory. However, it remains unclear whether perceived and memorized features are encoded by the same or different neurons and population activity patterns. Here we record LPFC neuronal activity while monkeys perceive the motion direction of a stimulus that remains visually available, or memorize the direction if the stimulus disappears. We find neurons with a wide variety of combinations of coding strength for perceived and memorized directions: some neurons encode both to similar degrees while others preferentially or exclusively encode either one. Reading out the combined activity of all neurons, a machine-learning algorithm reliably decodes the motion direction and determines whether it is perceived or memorized. Our results indicate that a functionally diverse population of LPFC neurons provides a substrate for discriminating between perceptual and mnemonic representations of visual features.
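A sketch of the population read-out described above, using synthetic tuning curves in place of the recorded LPFC data; scikit-learn's logistic regression stands in for the machine-learning read-out, and the gain-modulation model of perceived versus memorized trials is an assumption made only to generate data.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_neurons, n_trials = 60, 400
directions = rng.integers(0, 8, n_trials)           # 8 motion directions
condition = rng.integers(0, 2, n_trials)             # 0 = perceived, 1 = memorized

# Synthetic population: cosine direction tuning with a condition-dependent gain per neuron.
pref_dir = rng.integers(0, 8, n_neurons)
cond_gain = rng.uniform(0.5, 1.5, size=(2, n_neurons))
tuning = np.cos(2 * np.pi * (directions[:, None] - pref_dir[None, :]) / 8)
rates = cond_gain[condition] * (1.0 + tuning) + rng.normal(0.0, 0.3, (n_trials, n_neurons))

# Read out both the direction and the condition from the same population activity.
dir_acc = cross_val_score(LogisticRegression(max_iter=2000), rates, directions, cv=5).mean()
cond_acc = cross_val_score(LogisticRegression(max_iter=2000), rates, condition, cv=5).mean()
print(f"decoded direction accuracy: {dir_acc:.2f} (chance 0.125)")
print(f"decoded perceived-vs-memorized accuracy: {cond_acc:.2f} (chance 0.5)")
```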

}, url = {https://doi.org/10.1038/ncomms15471}, author = {Diego Mendoza-Halliday and Julio Martinez-Trujillo} } @article {2905, title = {Noninvasive Deep Brain Stimulation via Temporally Interfering Electric Fields}, journal = {Cell}, volume = {169}, year = {2017}, month = {Jan-06-2017}, pages = {1029 - 1041.e16}, abstract = {

We report a noninvasive strategy for electrically stimulating neurons at depth. By delivering to the brain multiple electric fields at frequencies too high to recruit neural firing, but which differ by a frequency within the dynamic range of neural firing, we can electrically stimulate neurons throughout a region where interference between the multiple fields results in a prominent electric field envelope modulated at the difference frequency. We validated this temporal interference (TI) concept via modeling and physics experiments, and verified that neurons in\ the living mouse brain could follow the electric field\ envelope. We demonstrate the utility of TI stimulation by stimulating neurons in the hippocampus of living mice without recruiting neurons of the overlying cortex. Finally, we show that by altering the currents delivered to a set of immobile electrodes, we can steerably evoke different motor patterns in living mice.
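The arithmetic behind temporal interference can be checked directly: two sinusoidal fields at kilohertz frequencies that differ by a small delta-f sum to a fast carrier whose envelope is modulated at that difference frequency. The frequencies, duration, and sampling rate below are illustrative.

```python
import numpy as np
from scipy.signal import hilbert

fs = 100_000                          # sampling rate, Hz
t = np.arange(0, 0.5, 1 / fs)         # half a second of signal
f1, f2 = 2000.0, 2010.0               # two fields, each too fast to drive firing on its own

field = np.sin(2 * np.pi * f1 * t) + np.sin(2 * np.pi * f2 * t)

# sin(2*pi*f1*t) + sin(2*pi*f2*t) = 2 sin(pi*(f1+f2)*t) cos(pi*(f2-f1)*t):
# a ~2 kHz carrier whose envelope is modulated at the difference frequency, here 10 Hz.
analytic_envelope = np.abs(2.0 * np.cos(np.pi * (f2 - f1) * t))
measured_envelope = np.abs(hilbert(field))

print("max envelope mismatch (analytic vs Hilbert):",
      float(np.max(np.abs(measured_envelope - analytic_envelope))))
```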

}, issn = {00928674}, doi = {10.1016/j.cell.2017.05.024}, url = {http://linkinghub.elsevier.com/retrieve/pii/S0092867417305846}, author = {Grossman, Nir and Bono, David and Dedic, Nina and Kodandaramaiah, Suhasa B. and Rudenko, Andrii and Suk, Ho-Jun and Antonino Cassara and Neufeld, Esra and Kuster, Niels and Tsai, Li-Huei and Pascual-Leone, Alvaro and Edward S Boyden} } @article {2566, title = {A null model for cortical representations with grandmothers galore}, journal = {Language, Cognition and Neuroscience}, year = {2017}, month = {08/2016}, pages = {274 - 285}, abstract = {

There has been extensive discussion in the literature about the extent to which cortical representations can be described as localist or distributed. Here, we discuss a simple null model that encompasses a family of related architectures describing the transformation of signals throughout the parts of the visual system involved in object recognition. This family of models constitutes a rigorous first approximation to explain the neurophysiological properties of ventral visual cortex. This null model contains both distributed and local representations throughout the entire hierarchy of computations and the responses of individual units are meaningful and interpretable when encoding is adequately defined for each computational stage.

}, keywords = {Computational models, human visual cortex, localist representation, sparse coding, visual recognition}, issn = {2327-3798}, doi = {10.1080/23273798.2016.1218033}, url = {https://www.tandfonline.com/doi/full/10.1080/23273798.2016.1218033}, author = {Gabriel Kreiman} } @article {3111, title = {Object-Oriented Deep Learning}, year = {2017}, month = {10/2017}, abstract = {

We investigate an unconventional direction of research that aims at converting neural networks, a class of distributed, connectionist, sub-symbolic models, into a symbolic level with the ultimate goal of achieving AI interpretability and safety. To that end, we propose Object-Oriented Deep Learning, a novel computational paradigm of deep learning that adopts interpretable {\textquotedblleft}objects/symbols{\textquotedblright} as a basic representational atom instead of N-dimensional tensors (as in traditional {\textquotedblleft}feature-oriented{\textquotedblright} deep learning). For visual processing, each {\textquotedblleft}object/symbol{\textquotedblright} can explicitly package common properties of visual objects like its position, pose, scale, probability of being an object, pointers to parts, etc., providing a full spectrum of interpretable visual knowledge throughout all layers. It achieves a form of {\textquotedblleft}symbolic disentanglement{\textquotedblright}, offering one solution to the important problem of disentangled representations and invariance. Basic computations of the network include predicting high-level objects and their properties from low-level objects and binding/aggregating relevant objects together. These computations operate at a more fundamental level than convolutions, capturing convolution as a special case while being significantly more general. All operations are executed in an input-driven fashion, thus sparsity and dynamic computation per sample are naturally supported, complementing recent popular ideas of dynamic networks, and may enable new types of hardware acceleration. We experimentally show on CIFAR-10 that it can perform flexible visual processing, rivaling the performance of ConvNet, but without using any convolution. Furthermore, it can generalize to novel rotations of images that it was not trained for.
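A minimal sketch of the kind of interpretable "object/symbol" atom the abstract describes, as a data structure rather than a feature tensor, together with a toy binding/aggregation step. The field names and the aggregation rule are illustrative assumptions, not the paper's implementation.

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class VisualObject:
    """An interpretable representational atom: properties are explicit rather than
    entangled in an N-dimensional feature tensor."""
    x: float                     # position
    y: float
    pose: float                  # orientation, radians
    scale: float
    objectness: float            # probability of being an object
    parts: List["VisualObject"] = field(default_factory=list)   # pointers to parts

def bind(parts: List[VisualObject]) -> VisualObject:
    """Toy binding/aggregation step: predict a higher-level object from its parts."""
    xs, ys = [p.x for p in parts], [p.y for p in parts]
    return VisualObject(
        x=sum(xs) / len(xs),
        y=sum(ys) / len(ys),
        pose=0.0,
        scale=max(p.scale for p in parts),
        objectness=min(p.objectness for p in parts),
        parts=parts,
    )

eye = VisualObject(x=10.0, y=4.0, pose=0.0, scale=1.0, objectness=0.9)
mouth = VisualObject(x=10.0, y=9.0, pose=0.0, scale=1.5, objectness=0.8)
face = bind([eye, mouth])
print(face.objectness, len(face.parts))
```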

}, author = {Qianli Liao and Tomaso Poggio} } @article {2739, title = {One- to Four-year-olds{\textquoteright} Ability to Connect Diverse Positive Emotional Expressions to Their Probable Causes }, year = {2017}, abstract = {

Adults have a sophisticated understanding of emotions; sufficiently sophisticated that English-speakers appreciate the distinction between feeling airy and animated, and terror and horror.\  To the degree that we make these distinctions, we represent not only the meaning of these emotion words, but also the causes and contexts that elicit them and the expressions and vocalizations that accompany them. The current study investigates how this rich understanding develops in childhood.

Previous research has found that infants can distinguish diverse emotional expressions and match emotional faces with their voices (e.g., Field, et al., 1982; Soderstrom, et al., 2015; Soken \& Pick, 1999; Walker-Andrews, 1997). They also represent positive and negative emotions in terms of their external causes and internal mental states (e.g., desires). For example, ten-month-olds refer to their parents{\textquoteright} facial expressions (i.e., positive or negative) in response to ambiguous stimuli (e.g., Klinnert, 1984; Moses, et al., 2001; Sorce, et al., 1985; Walden \& Ogan, 1988). They also expect an agent to feel positive rather than negative when she achieves her goal (Skerry \& Spelke, 2014). By eighteen months, toddlers can recover someone{\textquoteright}s likes and dislikes from her vocalizations ({\textquotedblleft}Yummy{\textquotedblright} or {\textquotedblleft}Yucky{\textquotedblright}) together with her emotional response (Repacholi \& Gopnik, 1997). However, there is little evidence that infants and toddlers can make distinct causal inferences from within-valence emotions. Some researchers have thus proposed that a relatively fine-grained understanding of emotion emerges only gradually after two years of age (e.g., Bormann-Kischkel, et al., 1990; Bullock \& Russell, 1984; 1985; 1986; Russell \& Widen, 2002; Widen \& Russell, 2003; 2008; 2010).

In our study, we investigate such fine-grained understanding by looking at young children{\textquoteright}s ability to map diverse within-valence emotional expressions onto their probable causes. We started by testing two- to four-year-olds (Experiment 1). Using a forced-choice task, children successfully identified the causes of positive vocal expressions elicited by exciting, delicious, adorable, funny, and sympathetic events (Figure 1; two-year-olds: M=.60, t(15)=2.745, p=.015; three-year-olds: M=.68, t(15)=3.637, p=.002; four-year-olds: M=.90, t(15)=29.589, p\<.001). Using the same materials, similar results were obtained in a preferential looking paradigm with 18-23-month-olds (Figure 2; the effect of Time: F(6, 490) = 6.55, p \< .001 for the initial sample and F(6, 527) = 6.96, p \< .001 for the replication). There was no effect of emotion category throughout. In Experiment 3, we used a manual search paradigm with 12-17-month-olds. During the experiment, the experimenter looked into a box and made an emotional vocalization (i.e., either {\textquotedblleft}Aww{\textellipsis}{\textquotedblright} indicating something cute, or {\textquotedblleft}Mmm{\textellipsis}{\textquotedblright} indicating something delicious). We found a trend suggesting that 12-17-month-olds searched longer in the box when they found a toy incongruent with the vocalization (i.e., hearing {\textquotedblleft}Aww{\textquotedblright} and finding a banana, or hearing {\textquotedblleft}Mmm{\textquotedblright} and finding a stuffed animal) than when they found one congruent (i.e., hearing {\textquotedblleft}Aww{\textquotedblright} and finding a stuffed animal, or hearing {\textquotedblleft}Mmm{\textquotedblright} and finding a banana; T=18.89, p=.083; permutation test). A pre-registered replication found similar results (T=35.35, p=.053). These results suggest that infants have an emerging ability to discriminate within-valence emotional expressions and infer their probable causes.

}, author = {Wu, Yang and Muentener, Paul and Laura Schulz} } @article {2784, title = {Online learning of symbolic concepts}, year = {2017} } @article {2568, title = {Organization of high-level visual cortex in human infants}, journal = {Nature Communications}, year = {2017}, month = {01/2017}, abstract = {

How much of the structure of the human mind and brain is already specified at birth, and how much arises from experience? In this article, we consider the test case of extrastriate visual cortex, where a highly systematic functional organization is present in virtually every normal adult, including regions preferring behaviourally significant stimulus categories, such as faces, bodies, and scenes. Novel methods were developed to scan awake infants with fMRI, while they viewed multiple categories of visual stimuli. Here we report that the visual cortex of 4{\textendash}6-month-old infants contains regions that respond preferentially to abstract categories (faces and scenes), with a spatial organization similar to adults. However, precise response profiles and patterns of activity across multiple visual categories differ between infants and adults. These results demonstrate that the large-scale organization of category preferences in visual cortex is adult-like within a few months after birth, but is subsequently refined through development.

}, doi = {10.1038/ncomms13995}, url = {http://www.nature.com/doifinder/10.1038/ncomms13995}, author = {Ben Deen and Richardson, Hilary and Dilks, Daniel D. and Takahashi, Atsushi and Keil, Boris and Lawrence Wald and Nancy Kanwisher and Rebecca Saxe} } @article {2943, title = {Oscillations, neural computations and learning during wake and sleep}, journal = {Current Opinion in Neurobiology}, volume = {44C}, year = {2017}, month = {07/2017}, type = {Review}, chapter = {193}, doi = {https://doi.org/10.1016/j.conb.2017.05.009}, author = {Hector Penagos and Varela, Carmen and Matthew A. Wilson} } @article {3450, title = {Perceiving social interactions in the posterior superior temporal sulcus}, journal = {Proceedings of the National Academy of Sciences}, volume = {114}, year = {2017}, month = {10/2017}, abstract = {

Primates are highly attuned not just to social characteristics of individual agents, but also to social interactions between multiple agents. Here we report a neural correlate of the representation of social interactions in the human brain. Specifically, we observe a strong univariate response in the posterior superior temporal sulcus (pSTS) to stimuli depicting social interactions between two agents, compared with (i) pairs of agents not interacting with each other, (ii) physical interactions between inanimate objects, and (iii) individual animate agents pursuing goals and interacting with inanimate objects. We further show that this region contains information about the nature of the social interaction{\textemdash}specifically, whether one agent is helping or hindering the other. This sensitivity to social interactions is strongest in a specific subregion of the pSTS but extends to a lesser extent into nearby regions previously implicated in theory of mind and dynamic face perception. This sensitivity to the presence and nature of social interactions is not easily explainable in terms of low-level visual features, attention, or the animacy, actions, or goals of individual agents. This region may underlie our ability to understand the structure of our social world and navigate within it.

}, issn = {Print ISSN: 0027-8424; Online ISSN: 1091-6490}, doi = { https://doi.org/10.1073/pnas.1714471114 }, url = {http://www.pnas.org/content/early/2017/10/06/1714471114.short}, author = {Leyla Isik and Kami Koldewyn and David Beeler and Nancy Kanwisher} } @proceedings {2537, title = {Physical problem solving: Joint planning with symbolic, geometric, and dynamic constraints}, year = {2017}, month = {07/2017}, abstract = {

In this paper, we present a new task that investigates how people interact with and make judgments about towers of blocks. In Experiment 1, participants in the lab solved a series of problems in which they had to re-configure three blocks from an initial to a final configuration. We recorded whether they used one hand or two hands to do so. In Experiment 2, we asked participants online to judge whether they think the person in the lab used one or two hands. The results revealed a close correspondence between participants{\textquoteright} actions in the lab and the mental simulations of participants online. To explain participants{\textquoteright} actions and mental simulations, we develop a model that plans over a symbolic representation of the situation, executes the plan using a geometric solver, and checks the plan{\textquoteright}s feasibility by taking into account the physical constraints of the scene. Our model explains participants{\textquoteright} actions and judgments to a high degree of quantitative accuracy.

}, keywords = {intuitive physics, logic-geometric programming, planning, problem solving, scene understanding}, author = {Ilker Yildirim and Tobias Gerstenberg and Basil Saeed and Marc Toussant and Joshua B. Tenenbaum} } @article {2958, title = {Predicting actions from subtle preparatory movements}, journal = {Cognition}, volume = {168}, year = {2017}, month = {01/2017}, pages = {65 - 75}, abstract = {

To study how people anticipate others{\textquoteright} actions, we designed a competitive reaching task. Subjects faced each other separated by a Plexiglas screen and their finger movements in 3D space were recorded with sensors. The first subject (Attacker) was instructed to touch one of two horizontally arranged targets on the screen. The other subject (Blocker) touched the same target as quickly as possible. Average finger reaction times (fRTs) were fast, much faster than reactions to a dot moving on the screen in the same manner as the Attacker{\textquoteright}s finger. This suggests the presence of subtle preparatory cues in other parts of the Attacker{\textquoteright}s body. We also recorded videos of Attackers{\textquoteright} movements and had Blockers play against unedited videos as well as videos that had all preparatory cues removed by editing out frames before Attacker finger movements started. Blockers{\textquoteright} fRTs in response to the edited videos were significantly slower (\~{}90\ ms). Also, reversing the preparatory movements in the videos tricked the Blockers into choosing the incorrect target at the beginning of their movement. Next, we occluded various body parts of the Attacker and showed that fRTs slow down only when most of the body of the Attacker is occluded. These results indicate that informative cues are widely distributed over the body and Blockers can use any piece from a set of redundant cues for action prediction. Reaction times in each condition remained constant over the duration of the testing sessions indicating a lack of learning during the experiment. These results suggest that during a dynamic two-person interaction, human subjects possess a remarkable and built-in action reading capacity allowing them to predict others{\textquoteright} goals and respond efficiently in this competitive setting.

}, keywords = {Action prediction, Action reading, Biological motion, Competitive interaction, Motor interaction}, issn = {00100277}, doi = {10.1016/j.cognition.2017.06.014}, url = {http://www.sciencedirect.com/science/article/pii/S0010027717301762}, author = {Maryam Vaziri-Pashkam and Sarah Cormiea and Ken Nakayama} } @conference {2587, title = {Predicting Native Language from Gaze}, booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL 2017)}, year = {2017}, author = {Yevgeni Berzak and Chie Nakamura and Suzanne Flynn and Boris Katz} } @article {3283, title = {Pruning Convolutional Neural Networks for Image Instance Retrieval}, year = {2017}, month = {07/2017}, abstract = {

In this work, we focus on the problem of image instance retrieval with deep descriptors extracted from pruned Convolutional Neural Networks (CNNs). The objective is to heavily prune convolutional edges while maintaining retrieval performance. To this end, we introduce both data-independent and data-dependent heuristics to prune convolutional edges, and evaluate their performance across various compression rates with different deep descriptors over several benchmark datasets. Further, we present an end-to-end framework to fine-tune the pruned network, with a triplet loss function specially designed for the retrieval task. We show that the combination of heuristic pruning and fine-tuning offers a 5x compression rate without considerable loss in retrieval performance.
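The simplest data-independent heuristic mentioned above can be sketched as magnitude-based pruning of a convolutional weight tensor, with the compression rate reported afterwards. The layer shape and keep fraction below are illustrative, and the triplet-loss fine-tuning stage is omitted.

```python
import numpy as np

rng = np.random.default_rng(0)

def prune_by_magnitude(weights, keep_fraction):
    """Zero the smallest-magnitude edges (a data-independent pruning heuristic)."""
    threshold = np.quantile(np.abs(weights), 1.0 - keep_fraction)
    mask = np.abs(weights) >= threshold
    return weights * mask, mask

conv_weights = rng.normal(size=(256, 128, 3, 3))      # out_ch, in_ch, kH, kW
pruned, mask = prune_by_magnitude(conv_weights, keep_fraction=0.2)

compression = conv_weights.size / mask.sum()
print(f"kept {mask.mean():.0%} of edges -> roughly {compression:.1f}x compression")
```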

}, keywords = {CNN, Image Instance Re- trieval, Pooling, Pruning, Triplet Loss}, url = {https://arxiv.org/abs/1707.05455}, author = {Gaurav Manek and Jie Lin and Vijay Chandrasekhar and Lingyu Duan and Sateesh Giduthuri and Xiaoli Li and Tomaso Poggio} } @article {2569, title = {The Quest for the FFA and Where It Led}, journal = {The Journal of Neuroscience}, volume = {37}, year = {2017}, month = {02/2017}, pages = {1056 - 1061}, abstract = {

This article tells the story behind our first paper on the fusiform face area (FFA): how we chose the question, developed the methods, and followed the data to find the FFA and subsequently many other functionally specialized cortical regions. The paper{\textquoteright}s impact had less to do with the particular findings in the paper itself and more to do with the method that it promoted and the picture of the human mind and brain that it led to. The use of a functional localizer to define a candidate region in each subject individually enabled us not just to make pictures of brain activation, but also to ask principled, hypothesis-driven questions about a thing in nature. This method enabled stronger and more extensive tests of the function of each cortical region than had been possible before in humans and, as a result, has produced a large body of evidence that the human cortex contains numerous regions that are specifically engaged in particular mental processes. The growing inventory of cortical regions with distinctive and often very specific functions can be seen as an initial sketch of the basic components of the human mind. This sketch also serves as a roadmap into the vast and exciting new landscape of questions about the computations, structural connections, time course, development, plasticity, and evolution of each of these regions, as well as the hardest question of all: how do these regions work together to produce human intelligence?

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.1706-16.2016}, url = {http://www.jneurosci.org/lookup/doi/10.1523/JNEUROSCI.1706-16.2016}, author = {Nancy Kanwisher} } @article {2763, title = {Rational quantitative attribution of beliefs, desires, and percepts in human mentalizing}, journal = {Nature Human Behavior}, volume = {1}, year = {2017}, month = {03/2017}, abstract = {

Social cognition depends on our capacity for {\textquoteleft}mentalizing{\textquoteright}, or explaining an agent{\textquoteright}s behaviour in terms of their mental states. The development and neural substrates of mentalizing are well-studied, but its computational basis is only beginning to be probed. Here we present a model of core mentalizing computations: inferring jointly an actor{\textquoteright}s beliefs, desires and percepts from how they move in the local spatial environment. Our Bayesian theory of mind (BToM) model is based on probabilistically inverting artificial-intelligence approaches to rational planning and state estimation, which extend classical expected-utility agent models to sequential actions in complex, partially observable domains. The model accurately captures the quantitative mental-state judgements of human participants in two experiments, each varying multiple stimulus dimensions across a large number of stimuli. Comparative model fits with both simpler {\textquoteleft}lesioned{\textquoteright} BToM models and a family of simpler non-mentalistic motion features reveal the value contributed by each component of our model.
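A one-dimensional toy of the inverse-planning computation at the core of this kind of model: candidate goals are scored by how likely a noisily rational agent pursuing each goal would be to produce the observed movement, and Bayes' rule turns those likelihoods into a posterior over goals. The line world, cost function, and rationality parameter are invented for illustration and are far simpler than the partially observable domains used in the paper.

```python
import numpy as np

goals = {"left": -5.0, "right": 5.0}      # candidate goal locations on a line
beta = 2.0                                 # rationality: higher = more efficient agent
prior = {g: 0.5 for g in goals}

def step_likelihood(pos, next_pos, goal):
    """Noisy-rational (softmax) choice among moving left, staying, or moving right."""
    moves = [pos - 1.0, pos, pos + 1.0]
    utilities = np.array([-abs(m - goal) for m in moves])    # cost = distance to the goal
    probs = np.exp(beta * utilities)
    probs /= probs.sum()
    return probs[moves.index(next_pos)]

def posterior_over_goals(trajectory):
    post = dict(prior)
    for pos, next_pos in zip(trajectory, trajectory[1:]):
        for name, loc in goals.items():
            post[name] *= step_likelihood(pos, next_pos, loc)
        z = sum(post.values())
        post = {name: p / z for name, p in post.items()}
    return post

# An agent observed stepping rightward is probably heading to the right-hand goal.
print(posterior_over_goals([0.0, 1.0, 2.0, 3.0]))
```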

}, keywords = {Human behaviour, Social behaviour}, doi = {doi:10.1038/s41562-017-0064}, url = {http://www.nature.com/articles/s41562-017-0064}, author = {Chris Baker and Julian Jara-Ettinger and Rebecca Saxe and Joshua B. Tenenbaum} } @inbook {2126, title = {Recognition of occluded objects}, booktitle = {Computational and Cognitive Neuroscience of Vision}, year = {2017}, publisher = {Springer Singapore}, organization = {Springer Singapore}, issn = {978-981-10-0211-3}, url = {http://www.springer.com/us/book/9789811002113}, author = {Hanlin Tang and Gabriel Kreiman and Qi Zhao} } @article {2525, title = {Reinforcement learning and episodic memory in humans and animals: an integrative framework}, journal = {Annual Review of Psychology}, volume = {68}, year = {2017}, chapter = {101}, author = {Samuel J Gershman and Nathaniel D Daw} } @conference {2673, title = {Representation Learning from Orbit Sets for One-shot Classification}, booktitle = {AAAI Spring Symposium Series, Science of Intelligence}, year = {2017}, address = {AAAI}, abstract = {

The sample complexity of a learning task is increased by transformations that do not change class identity. Visual object recognition for example, i.e. the discrimination or categorization of distinct semantic classes, is affected by changes in viewpoint, scale, illumination or planar transformations. We introduce a weakly-supervised framework for learning robust and selective representations from sets of transforming examples (orbit sets). We train deep encoders that explicitly account for the equivalence up to transformations of orbit sets and show that the resulting encodings contract the intra-orbit distance and preserve identity either by preserving reconstruction or by increasing the inter-orbit distance. We explore a loss function that combines a discriminative term, and a reconstruction term that uses a decoder-encoder map to learn to rectify transformation-perturbed examples, and demonstrate the validity of the resulting embeddings for one-shot learning. Our results suggest that a suitable definition of orbit sets is a form of weak supervision that can be exploited to learn semantically relevant embeddings.
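The loss structure described above can be sketched numerically: contract distances within each orbit (the set of transformed versions of one example), keep different orbits apart by a margin, and add a reconstruction term in which a decoder-encoder map should rectify transformed examples. The linear encoder and decoder below are placeholders for the deep encoders in the paper, and the margin and weights are arbitrary.

```python
import numpy as np

rng = np.random.default_rng(0)
dim, code_dim = 20, 8
W_enc = rng.normal(size=(code_dim, dim)) / np.sqrt(dim)        # stand-in for the encoder
W_dec = rng.normal(size=(dim, code_dim)) / np.sqrt(code_dim)   # stand-in for the decoder

def orbit_loss(orbits, margin=1.0, recon_weight=0.1):
    """orbits: list of arrays, each (n_transforms, dim) holding one example's orbit."""
    codes = [o @ W_enc.T for o in orbits]
    centers = np.stack([c.mean(axis=0) for c in codes])

    # Discriminative term: contract intra-orbit spread, keep orbit centers a margin apart.
    intra = sum(np.mean(np.sum((c - c.mean(axis=0)) ** 2, axis=1)) for c in codes)
    center_dists = np.linalg.norm(centers[:, None] - centers[None, :], axis=-1)
    off_diag = ~np.eye(len(orbits), dtype=bool)
    inter = np.sum(np.maximum(0.0, margin - center_dists)[off_diag])

    # Reconstruction term: a decoder-encoder map should rectify transformed examples.
    recon = sum(np.mean((c @ W_dec.T - o) ** 2) for c, o in zip(codes, orbits))
    return intra + inter + recon_weight * recon

orbits = [rng.normal(size=(5, dim)) + 3.0 * rng.normal(size=dim) for _ in range(4)]
print(f"orbit-set loss for a random (untrained) encoder: {orbit_loss(orbits):.2f}")
```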

}, url = {https://www.aaai.org/ocs/index.php/SSS/SSS17/paper/view/15357}, author = {Andrea Tacchetti and Stephen Voinea and Georgios Evangelopoulos and Tomaso Poggio} } @article {2681, title = {On the Robustness of Convolutional Neural Networks to Internal Architecture and Weight Perturbations}, year = {2017}, month = {03/2017}, abstract = {

Deep convolutional neural networks are generally regarded as robust function approximators. So far, this intuition is based on perturbations to external stimuli such as the images to be classified. Here we explore the robustness of convolutional neural networks to perturbations to the internal weights and architecture of the network itself. We show that convolutional networks are surprisingly robust to a number of internal perturbations in the higher convolutional layers but the bottom convolutional layers are much more fragile. For instance, Alexnet shows less than a 30\% decrease in classification performance when randomly removing over 70\% of weight connections in the top convolutional or dense layers but performance is almost at chance with the same perturbation in the first convolutional layer. Finally, we suggest further investigations which could continue to inform the robustness of convolutional networks to internal perturbations.
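The perturbation protocol is easy to state in code: zero a random fraction of a layer's weights and measure the drop in classification accuracy. The sketch below applies it to a tiny random-feature classifier on synthetic data rather than to AlexNet, so it illustrates the procedure but will not reproduce the layer-specific fragility reported above.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy two-class data and a tiny two-layer network (random features + linear readout).
X = np.vstack([rng.normal(-1.0, 1.0, (200, 20)), rng.normal(1.0, 1.0, (200, 20))])
y = np.array([0] * 200 + [1] * 200)
W1 = rng.normal(size=(20, 100))
H = np.tanh(X @ W1)
W2 = np.linalg.lstsq(H, np.eye(2)[y], rcond=None)[0]     # readout fit by least squares

def accuracy(W1_, W2_):
    scores = np.tanh(X @ W1_) @ W2_
    return float(np.mean(scores.argmax(axis=1) == y))

def ablate(W, fraction):
    """Internal perturbation: randomly zero a fraction of one layer's weights."""
    return W * (rng.random(W.shape) >= fraction)

print("intact accuracy:", accuracy(W1, W2))
for frac in (0.3, 0.7):
    print(f"ablate {frac:.0%} of layer 1: {accuracy(ablate(W1, frac), W2):.2f}   "
          f"of layer 2: {accuracy(W1, ablate(W2, frac)):.2f}")
```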

}, author = {Nicholas Cheney and Martin Schrimpf and Gabriel Kreiman} } @article {3394, title = {Seeing a straight line on a curved surface: decoupling of patterns from surfaces by single IT neurons}, journal = {Journal of Neurophysiology}, volume = {11773}, year = {2017}, month = {Jan-01-2017}, pages = {104 - 116}, issn = {0022-3077}, doi = {10.1152/jn.00551.2016}, url = {http://www.physiology.org/doi/10.1152/jn.00551.2016}, author = {N. Apurva Ratan Murty and Arun, S. P.} } @article {3039, title = {Seeing faces is necessary for face-domain formation}, journal = {Nature Neuroscience}, volume = {5631628}, year = {2017}, month = {09/2017}, abstract = {

Here we report that monkeys raised without exposure to faces did not develop face domains, but did develop domains for other categories and did show normal retinotopic organization, indicating that early face deprivation leads to a highly selective cortical processing deficit. Therefore, experience must be necessary for the formation (or maintenance) of face domains. Gaze tracking revealed that control monkeys looked preferentially at faces, even at ages prior to the emergence of face domains, but face-deprived monkeys did not, indicating that face looking is not innate. A retinotopic organization is present throughout the visual system at birth, so selective early viewing behavior could bias category-specific visual responses toward particular retinotopic representations, thereby leading to domain formation in stereotyped locations in inferotemporal cortex, without requiring category-specific templates or biases. Thus, we propose that environmental importance influences viewing behavior, viewing behavior drives neuronal activity, and neuronal activity sculpts domain formation.

}, issn = {1097-6256}, doi = {10.1038/nn.4635}, url = {http://www.nature.com/doifinder/10.1038/nn.4635}, author = {Michael J Arcaro and Peter F Schade and Vincent, Justin L and Carlos R Ponce and Margaret S Livingstone} } @conference {3596, title = {Self-supervised intrinsic image decomposition.}, booktitle = { Annual Conference on Neural Information Processing Systems (NIPS)}, year = {2017}, month = {12/2017}, address = {Long Beach, CA}, url = {https://papers.nips.cc/paper/7175-self-supervised-intrinsic-image-decomposition}, author = {Michael Janner and Jiajun Wu and Tejas Kulkarni and Ilker Yildirim and Joshua B. Tenenbaum} } @proceedings {3242, title = {Shape and Material from Sound}, year = {2017}, month = {12/2017}, pages = {1278{\textendash}1288}, address = {Long Beach, CA}, abstract = {

What can we infer from hearing an object falling onto the ground? Based on knowledge of the physical world, humans are able to infer rich information from such limited data: rough shape of the object, its material, the height of falling, etc. In this paper, we aim to approximate such competency. We first mimic the human knowledge about the physical world using a fast physics-based generative model. Then, we present an analysis-by-synthesis approach to infer properties of the falling object. We further approximate human past experience by directly mapping audio to object properties using deep learning with self-supervision. We evaluate our method through behavioral studies, where we compare human predictions with ours on inferring object shape, material, and initial height of falling. Results show that our method achieves near-human performance, without any annotations.
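A toy analysis-by-synthesis loop in the spirit of the approach above: a fast generative model maps latent properties to a waveform, and inference searches over those latents for the setting that best reproduces an observed sound. The damped-sinusoid synthesizer, the choice of latents (a damping coefficient standing in for material and a drop height), and the grid search are crude stand-ins for the physics-based model and inference used in the paper.

```python
import numpy as np

fs = 16_000
t = np.arange(0, 0.5, 1 / fs)
rng = np.random.default_rng(0)

def synthesize(damping, height):
    """Crude impact-sound model: louder for higher drops, faster decay for more damping."""
    amplitude = np.sqrt(height)                    # impact energy grows with drop height
    return amplitude * np.exp(-damping * t) * np.sin(2 * np.pi * 440.0 * t)

observed = synthesize(damping=12.0, height=1.5) + rng.normal(0.0, 0.01, t.size)

# Analysis-by-synthesis: search the latent properties that best reproduce the observation.
dampings = np.linspace(2.0, 30.0, 50)
heights = np.linspace(0.25, 3.0, 50)
errors = np.array([[np.mean((synthesize(d, h) - observed) ** 2) for h in heights]
                   for d in dampings])
best_d, best_h = np.unravel_index(errors.argmin(), errors.shape)
print(f"inferred damping ~ {dampings[best_d]:.1f}, inferred height ~ {heights[best_h]:.2f}")
```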

}, url = {http://papers.nips.cc/paper/6727-shape-and-material-from-sound.pdf}, author = {zhang, zhoutong and Qiujia Li and Zhengjia Huang and Jiajun Wu and Joshua B. Tenenbaum and William T. Freeman}, editor = {I. Guyon and U. V. Luxburg and S. Bengio and H. Wallach and R. Fergus and S. Vishwanathan and R. Garnett} } @article {2358, title = {Six-month-old infants expect agents to minimize the cost of their actions}, journal = {Cognition}, volume = {160}, year = {2017}, month = {03/2017}, pages = {35-42}, abstract = {

Substantial evidence indicates that infants expect agents to move directly to their goals when no obstacles block their paths, but the representations that articulate this expectation and its robustness have not been characterized. Across three experiments (total N = 60), 6-month-old infants responded to a novel, curvilinear action trajectory on the basis of its efficiency, in accord with the expectation that an agent will move to its goal on the least costly path that the environment affords. Infants expected minimally costly action when presented with a novel constraint, and extended this expectation to agents who had previously acted inefficiently. Infants{\textquoteright} understanding of goal-directed action cannot be explained alone by sensitivity to specific features of agents{\textquoteright} actions (e.g. agents tend to move on straight paths, along supporting surfaces, when facing their goals directly) or extrapolations of agents{\textquoteright} past actions to their future ones (e.g. if an agent took the shortest path to an object in the past, it will continue to do so in the future). Instead, infants{\textquoteright} reasoning about efficiency accords with the overhypothesis that agents minimize the cost of their actions.

}, keywords = {cognitive development, goal inference, open data, open materials, social cognition}, issn = {00100277}, doi = {10.1016/j.cognition.2016.12.007}, url = {http://www.sciencedirect.com/science/article/pii/S001002771630302X}, author = {Shari Liu and Elizabeth S Spelke} } @article {3523, title = {Size-Independent Sample Complexity of Neural Networks}, year = {2017}, author = {N. Golowich and A. Rakhlin and O. Shamir} } @conference {2603, title = {Spatial cognition across development}, booktitle = {SRCD}, year = {2017}, address = {Austin, TX}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {4067, title = {Sustained Activity Encoding Working Memories: Not Fully Distributed}, journal = {Trends in Neurosciences }, volume = {40 }, year = {2017}, month = {June 2017}, pages = {328-346}, abstract = {

Working memory (WM) is the ability to remember and manipulate information for short time intervals. Recent studies have proposed that sustained firing encoding the contents of WM is ubiquitous across cortical neurons. We review here the collective evidence supporting this claim. A variety of studies report that neurons in prefrontal, parietal, and inferotemporal association cortices show robust sustained activity encoding the location and features of memoranda during WM tasks. However, reports of WM-related sustained activity in early sensory areas are rare, and typically lack stimulus specificity. We propose that robust sustained activity that can support WM coding arises as a property of association cortices downstream from the early stages of sensory processing.

}, keywords = {working memory, sustained activity, neurophysiology, fMRI, primate, review}, doi = {https://doi.org/10.1016/j.tins.2017.04.004}, url = {https://www.sciencedirect.com/science/article/pii/S0166223617300711}, author = {Leavitt, M.L. and Diego Mendoza-Halliday and Martinez-Trujillo J.C.} } @article {2900, title = {Symmetry Regularization}, number = {063}, year = {2017}, month = {05/2017}, abstract = {

The properties of a representation, such as smoothness, adaptability, generality, equivariance/invariance, depend on restrictions imposed during learning. In this paper, we propose using data symmetries, in the sense of equivalences under transformations, as a means for learning symmetry-adapted representations, i.e., representations that are equivariant to transformations in the original space. We provide a sufficient condition to enforce the representation, for example the weights of a neural network layer or the atoms of a dictionary, to have a group structure and specifically the group structure present in an unlabeled training set. By reducing the analysis of generic group symmetries to permutation symmetries, we devise an analytic expression for a regularization scheme and a permutation invariant metric on the representation space. Our work provides a proof of concept on why and how to learn equivariant representations, without explicit knowledge of the underlying symmetries in the data.
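
A minimal sketch of the general idea, under our own simplifying assumptions (this is not the paper's regularizer): measure how far a linear representation W is from being equivariant to a known permutation group by projecting W onto the group-averaged (equivariant) subspace and penalizing the residual.

import numpy as np

rng = np.random.default_rng(0)
d = 6
# Group: cyclic shifts of the input coordinates, as permutation matrices.
perms = [np.roll(np.eye(d), k, axis=1) for k in range(d)]

def project_equivariant(W, perms):
    # Group averaging (Reynolds operator): the result commutes with every P_g.
    return sum(P.T @ W @ P for P in perms) / len(perms)

def symmetry_penalty(W, perms):
    # Squared distance from W to its equivariant projection.
    return float(np.sum((W - project_equivariant(W, perms)) ** 2))

W = rng.normal(size=(d, d))
print(symmetry_penalty(W, perms))                              # large for a generic matrix
print(symmetry_penalty(project_equivariant(W, perms), perms))  # ~0 once projected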

}, author = {F. Anselmi and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {4235, title = {Synthesizing 3D Shapes via Modeling Multi-view Depth Maps and Silhouettes with Deep Generative Networks}, booktitle = {2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2017}, month = {07/2017}, address = {Honolulu, HI}, abstract = {

We study the problem of learning generative models of 3D shapes. Voxels or 3D parts have been widely used as the underlying representations to build complex 3D shapes; however, voxel-based representations suffer from high memory requirements, and parts-based models require a large collection of cached or richly parametrized parts. We take an alternative approach: learning a generative model over multi-view depth maps or their corresponding silhouettes, and using a deterministic rendering function to produce 3D shapes from these images. A multi-view representation of shapes enables generation of 3D models with fine details, as 2D depth maps and silhouettes can be modeled at a much higher resolution than 3D voxels. Moreover, our approach naturally brings the ability to recover the underlying 3D representation from depth maps of one or a few viewpoints. Experiments show that our framework can generate 3D shapes with variations and details. We also demonstrate that our model has out-of-sample generalization power for real-world tasks with occluded objects.

}, keywords = {2d to 3d, 3D generation, 3D reconstruction, Core object system, depth map, generative, perception, silhouette}, doi = {10.1109/CVPR.2017.269}, url = {http://ieeexplore.ieee.org/document/8099752/}, author = {Amir Arsalan Soltani and Haibin Huang and Jiajun Wu and Tejas Kulkarni and Joshua B. Tenenbaum} } @conference {3492, title = {Temporal Grounding Graphs for Language Understanding with Accrued Visual-Linguistic Context}, booktitle = {Proceedings of the Twenty-Sixth International Joint Conference on Artificial Intelligence (IJCAI 2017)}, year = {2017}, month = {08/2017}, address = {Melbourne, Australia}, abstract = {

A robot{\textquoteright}s ability to understand or ground natural language instructions is fundamentally tied to its knowledge about the surrounding world. We present an approach to grounding natural language utterances in the context of factual information gathered through natural-language interactions and past visual observations. A probabilistic model estimates, from a natural language utterance, the objects, relations, and actions that the utterance refers to, the objectives for future robotic actions it implies, and generates a plan to execute those actions while updating a state representation to include newly acquired knowledge from the visual-linguistic context. Grounding a command necessitates a representation for past observations and interactions; however, maintaining the full context consisting of all possible observed objects, attributes, spatial relations, actions, etc., over time is intractable. Instead, our model, Temporal Grounding Graphs, maintains a learned state representation for a belief over factual groundings, those derived from natural-language interactions, and lazily infers new groundings from visual observations using the context implied by the utterance. This work significantly expands the range of language that a robot can understand by incorporating factual knowledge and observations of its workspace into its inference about the meaning and grounding of natural-language utterances.

}, author = {Rohan Paul and Andrei Barbu and Sue Felshin and Boris Katz and Nicholas Roy} } @article {3485, title = {Ten-month-old infants infer the value of goals from the costs of actions}, journal = {Science}, volume = {358}, year = {2017}, month = {11/2017}, pages = {1038-1041}, chapter = {1038}, abstract = {

Infants understand that people pursue goals, but how do they learn which goals people prefer? We tested whether infants solve this problem by inverting a mental model of action planning, trading off the costs of acting against the rewards actions bring. After seeing an agent attain two goals equally often at varying costs, infants expected the agent to prefer the goal it attained through costlier actions. These expectations held across three experiments that conveyed cost through different physical path features (height, width, and incline angle), suggesting that an abstract variable{\textemdash}such as {\textquotedblleft}force,{\textquotedblright} {\textquotedblleft}work,{\textquotedblright} or {\textquotedblleft}effort{\textquotedblright}{\textemdash}supported infants{\textquoteright} inferences. We modeled infants{\textquoteright} expectations as Bayesian inferences over utility-theoretic calculations, providing a bridge to recent quantitative accounts of action understanding in older children and adults.

}, author = {Shari Liu and Ullman, Tomer D. and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2636, title = {Ten-month-old infants infer value from effort}, year = {2017}, author = {Shari Liu and Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2604, title = {Ten-month-old infants infer value from effort}, year = {2017}, address = {Austin, TX}, author = {Shari Liu and Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {3499, title = {Thalamic contribution to CA1-mPFC interactions during sleep}, number = {Program$\#$/Poster$\#$: 799.13/TT8}, year = {2017}, address = {Washington, DC}, abstract = {

The consolidation of episodic memories is thought to require precisely timed interactions between cells in the hippocampus and neocortex during sleep, but the specific mechanisms by which this dialogue unfolds are poorly understood. During sleep, activity in the hippocampus and neocortex is temporally structured by a slow oscillation (1-4Hz) that frames the occurrence of faster oscillations: spindles (7-14Hz) in neocortex, and ripples (150-200Hz) in hippocampus. The observation of spindles suggests the participation of the thalamus, but its contribution has remained an open question. I will present results from simultaneous extracellular recordings of single units and local field potentials in the midline thalamus, mPFC and CA1 in freely behaving rats.

We find that both CA1 ripples and unit firing in the midline thalamus are coordinated with the neocortical slow oscillation. Interestingly, while hippocampal ripples are more likely to occur in 250ms windows before and after neocortical K-complexes (KCs, which mark the downstate of the slow oscillation), spiking probability in a subset of thalamic units is asymmetric and increases following neocortical KCs. Of the units recorded in midline thalamus simultaneously with CA1 and mPFC (n=29), 20.7\% showed a significant increase in firing rate (\>2 standard deviations from baseline) following mPFC KCs. This finding suggests that the time following KCs (the start of the slow oscillation) is functionally different from the end of the oscillation (before KCs), and includes an increased contribution from cells in the midline thalamus, which could influence neocortical populations in preparation for the reactivation of hippocampal memory traces. Furthermore, the correlation between KCs and thalamic units can be modulated by CA1 ripples, suggesting that combined {\textquoteleft}neocortical KC+CA1 ripple{\textquoteright} events can reveal subtle interactions between the three regions. Lastly, units in the reuniens and the ventromedial nuclei show a broad decrease in spiking probability around the time of hippocampal ripples. 57.6\% of units in these nuclei present a significant drop in firing rate compared to 20.8\% in cells recorded in other midline nuclei (p\< 0.05; n=57 units). This suggests that certain thalamic nuclei may be key for gating the transfer of memory information from the hippocampus to neocortex, by opening a time window in which ripples may be more likely to occur.

These results provide the first evidence of the involvement of midline thalamic cells in neocortico-hippocampal interactions during sleep, and point to specific mechanisms by which multi-region brain interactions may contribute to the systems consolidation of memories.

}, author = {Varela, Carmen and Matthew A. Wilson} } @article {2804, title = {Theoretical principles of multiscale spatiotemporal control of neuronal networks: a complex systems perspective}, year = {2017}, abstract = {

Success in the fine control of the nervous system depends on a deeper understanding of how neural circuits control behavior. There is, however, a wide gap between the components of neural circuits and behavior. We advance the idea that a suitable approach for narrowing this gap has to be based on a multiscale information-theoretic description of the system. We evaluate the possibility that brain-wide complex neural computations can be dissected into a hierarchy of computational motifs that rely on smaller circuit modules interacting at multiple scales. In doing so, we draw attention to the importance of formalizing the goals of stimulation in terms of neural computations so that the possible implementations are matched in scale to the underlying circuit modules.

}, doi = {10.1101/097618}, url = {http://biorxiv.org/content/early/2017/01/01/097618.full.pdf+html}, author = {Nima Dehghani} } @article {2698, title = {Theory II: Landscape of the Empirical Risk in Deep Learning}, year = {2017}, month = {03/2017}, abstract = {

Previous theoretical work on deep learning and neural network optimization tends to focus on avoiding saddle points and local minima. However, the practical observation is that, at least for the most successful Deep Convolutional Neural Networks (DCNNs) for visual processing, practitioners can always increase the network size to fit the training data (an extreme example would be [1]). The most successful DCNNs such as VGG and ResNets are best used with a small degree of "overparametrization". In this work, we characterize, with a mix of theory and experiments, the landscape of the empirical risk of overparametrized DCNNs. We first prove the existence of a large number of degenerate global minimizers with zero empirical error (modulo inconsistent equations). The zero-minimizers -- in the case of classification -- have a non-zero margin. The same minimizers are degenerate and thus very likely to be found by SGD, which will furthermore select, with higher probability, the zero-minimizer with larger margin, as discussed in Theory III (to be released). We further experimentally explored and visualized the landscape of the empirical risk of a DCNN on CIFAR-10 during the entire training process, and especially around the global minima. Finally, based on our theoretical and experimental results, we propose an intuitive model of the landscape of the DCNN{\textquoteright}s empirical loss surface, which might not be as complicated as people commonly believe.

}, author = {Tomaso Poggio and Qianli Liao} } @article {3261, title = {Theory of Deep Learning IIb: Optimization Properties of SGD}, year = {2017}, month = {12/2017}, abstract = {

In Theory IIb we characterize, with a mix of theory and experiments, the optimization of deep convolutional networks by Stochastic Gradient Descent. The main new result in this paper is theoretical and experimental evidence for the following conjecture about SGD: SGD concentrates in probability {\textendash} like the classical Langevin equation {\textendash} on large-volume, {\textquotedblleft}flat{\textquotedblright} minima, selecting flat minimizers that are, with very high probability, also global minimizers.
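
The following toy simulation (our own, not from the memo) illustrates the flavor of the conjecture: Langevin-style noisy gradient descent on a one-dimensional loss with a narrow minimum and a wide, "flat" minimum of equal depth spends most of its time in the wide basin.

import numpy as np

def grad(x):
    # loss(x) = min( 50*(x+1)^2 , 0.5*(x-1)^2 ): a sharp well and a flat well of equal depth.
    return 100.0 * (x + 1.0) if 50.0 * (x + 1.0) ** 2 < 0.5 * (x - 1.0) ** 2 else (x - 1.0)

rng = np.random.default_rng(0)
steps, lr, temperature = 500_000, 1e-3, 1.0
noise = rng.normal(size=steps) * np.sqrt(2 * lr * temperature)
x, in_flat = -1.0, 0                               # start inside the sharp well
for k in range(steps):
    x = x - lr * grad(x) + noise[k]                # Langevin / noisy gradient step
    in_flat += 50.0 * (x + 1.0) ** 2 > 0.5 * (x - 1.0) ** 2
print(f"fraction of steps spent in the flat basin: {in_flat / steps:.2f}")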

}, author = {Chiyuan Zhang and Qianli Liao and Alexander Rakhlin and Brando Miranda and Noah Golowich and Tomaso Poggio} } @article {3266, title = {Theory of Deep Learning III: explaining the non-overfitting puzzle}, year = {2017}, month = {12/2017}, abstract = {

THIS MEMO IS REPLACED BY CBMM MEMO 90

A main puzzle of deep networks revolves around the absence of overfitting despite overparametrization and despite the large capacity demonstrated by zero training error on randomly labeled data. In this note, we show that the dynamical systems associated with gradient descent minimization of nonlinear networks behave, near zero stable minima of the empirical error, as a gradient system in a quadratic potential with a degenerate Hessian. The proposition is supported by theoretical and numerical results, under the assumption of stable minima of the gradient.

Our proposition provides the extension to deep networks of key properties of gradient descent methods for linear networks that, as suggested in (1), can be the key to understanding generalization. Gradient descent enforces a form of implicit regularization controlled by the number of iterations, asymptotically converging to the minimum-norm solution. This implies that there is usually an optimal early-stopping point that avoids overfitting of the loss (this is relevant mainly for regression). For classification, the asymptotic convergence to the minimum-norm solution implies convergence to the maximum-margin solution, which guarantees good classification error for {\textquotedblleft}low noise{\textquotedblright} datasets.
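
A minimal numerical check of the linear-network intuition just described (our own sketch, not the memo's code): for underdetermined least squares, plain gradient descent initialized at zero converges to the minimum-norm (pseudoinverse) solution while driving the training error to zero.

import numpy as np

rng = np.random.default_rng(0)
n, d = 20, 100                          # fewer equations than unknowns (overparametrized)
X, y = rng.normal(size=(n, d)), rng.normal(size=n)

w, lr = np.zeros(d), 1e-3
for _ in range(50_000):
    w -= lr * X.T @ (X @ w - y) / n     # gradient of the mean squared error

w_min_norm = np.linalg.pinv(X) @ y
print(np.allclose(w, w_min_norm, atol=1e-6))   # True: GD found the minimum-norm solution
print(float(np.linalg.norm(X @ w - y)))        # ~0: zero training error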

The implied robustness to overparametrization has suggestive implications for the robustness of deep hierarchically local networks to variations of the architecture with respect to the curse of dimensionality.

}, author = {Tomaso Poggio and Kenji Kawaguchi and Qianli Liao and Brando Miranda and Lorenzo Rosasco and Xavier Boix and Jack Hidary and Hrushikesh Mhaskar} } @article {3292, title = {Theory of Intelligence with Forgetting: Mathematical Theorems Explaining Human Universal Forgetting using {\textquotedblleft}Forgetting Neural Networks{\textquotedblright}}, year = {2017}, month = {12/2017}, abstract = {

In [42] we suggested that any memory stored in the human/animal brain is forgotten following the Ebbinghaus curve {\textendash} in this follow-on paper, we define a novel algebraic structure, a Forgetting Neural Network, as a simple mathematical model based on assuming parameters of a neuron in a neural network are forgotten using the Ebbinghaus forgetting curve. We model neural networks in Sobolev spaces using [35] as our departure point and demonstrate four novel theorems of Forgetting Neural Networks: the theorem of non-instantaneous forgetting, the theorem of universal forgetting, the curse of forgetting theorem, and the center of mass theorem. We also prove the novel decreasing inference theorem, which we feel is relevant beyond Ebbinghaus forgetting: compositional deep neural networks cannot arbitrarily combine low-level {\textquotedblleft}features{\textquotedblright} {\textendash} meaning only certain arrangements of features calculated in intermediate levels can show up in higher levels. This proof leads us to present possibly the most efficient representation for neural networks, the {\textquotedblleft}minimal polynomial basis layer{\textquotedblright} (MPBL), since our basis construct can generate n polynomials of order m using only 2m + 1 + n neurons. As we briefly discuss in the conclusion, there are about 10 similarities between forgetting neural networks and human forgetting; our research elicits more questions than it answers and may have implications for neuroscience research, including our understanding of how babies learn (or, perhaps, forget) and what we call the baby forgetting conjecture.
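
A toy sketch under our own assumptions (the memo's constructions are more elaborate, and the decay rate below is arbitrary): applying an Ebbinghaus-style retention curve R(t) = exp(-t/S) directly to a layer's parameters.

import numpy as np

rng = np.random.default_rng(0)

def forget(weights, t, strength):
    # Ebbinghaus-style retention: the fraction exp(-t/strength) of each weight survives.
    return weights * np.exp(-t / strength)

W = rng.normal(size=(4, 4))            # parameters of one hypothetical layer
for t in (0, 1, 5, 20):
    retained = np.linalg.norm(forget(W, t, strength=5.0)) / np.linalg.norm(W)
    print(f"t={t:>2}: fraction of weight norm retained = {retained:.2f}")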

}, author = {Felipe Cano-Córdoba and Sanjay Sarma and Brian Subirana} } @article {2646, title = {Thinking fast or slow? A reinforcement-learning approach}, year = {2017}, address = {San Antonio, TX}, author = {Kool, W and Samuel J Gershman and Fiery A Cushman} } @proceedings {2762, title = {Tunable Efficient Unitary Neural Networks (EUNN) and their application to RNN}, volume = {70}, year = {2017}, month = {08/2017}, pages = {1733-1741 }, url = {https://arxiv.org/abs/1612.05231}, author = {Jing, Li and Shen, Yichen and Dub{\v c}ek, Tena and Peurifoi, John and Skirlo, Scott and LeCun, Yann and Max Tegmark and Solja{\v c}i{\'c}, Marin} } @article {3935, title = {Two areas for familiar face recognition in the primate brain}, journal = {Science}, volume = {357}, year = {2017}, month = {08/2017}, pages = {591 - 595}, chapter = {591}, abstract = {

Familiarity alters face recognition: Familiar faces are recognized more accurately than unfamiliar ones and under difficult viewing conditions when unfamiliar face recognition fails. The neural basis for this fundamental difference remains unknown. Using whole-brain functional magnetic resonance imaging, we found that personally familiar faces engage the macaque face-processing network more than unfamiliar faces. Familiar faces also recruited two hitherto unknown face areas at anatomically conserved locations within the perirhinal cortex and the temporal pole. These two areas, but not the core face-processing network, responded to familiar faces emerging from a blur with a characteristic nonlinear surge, akin to the abruptness of familiar face recognition. In contrast, responses to unfamiliar faces and objects remained linear. Thus, two temporal lobe areas extend the core face-processing network into a familiar face-recognition system.

}, issn = {0036-8075}, doi = {10.1126/science.aan1139}, url = {http://www.sciencemag.org/lookup/doi/10.1126/science.aan1139}, author = {Landi, Sofia M. and W. A. Freiwald} } @article {2327, title = {View-Tolerant Face Recognition and Hebbian Learning Imply Mirror-Symmetric Neural Tuning to Head Orientation}, journal = {Current Biology}, volume = {27}, year = {2017}, month = {01/2017}, pages = {1-6}, abstract = {

The primate brain contains a hierarchy of visual areas, dubbed the ventral stream, which rapidly computes object representations that are both specific for object identity and robust against identity-preserving transformations, like depth rotations. Current computational models of object recognition, including recent deep-learning networks, generate these properties through a hierarchy of alternating selectivity-increasing filtering and tolerance-increasing pooling operations, similar to the operations of simple and complex cells. Here, we prove that a class of hierarchical architectures and a broad set of biologically plausible learning rules generate approximate invariance to identity-preserving transformations at the top level of the processing hierarchy. However, all past models tested failed to reproduce the most salient property of an intermediate representation of a three-level face-processing hierarchy in the brain: mirror-symmetric tuning to head orientation. Here, we demonstrate that one specific biologically plausible Hebb-type learning rule generates mirror-symmetric tuning to bilaterally symmetric stimuli, like faces, at intermediate levels of the architecture and show why it does so. Thus, the tuning properties of individual cells inside the visual stream appear to result from group properties of the stimuli they encode and to reflect the learning rules that sculpted the information-processing system within which they reside.
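
The following simplified NumPy illustration (ours; far simpler than the model in the paper) captures one ingredient of the argument: when the stimulus set is closed under left-right mirroring, the input covariance commutes with the mirror operator, so a Hebbian/Oja-type unit, whose weight vector converges to a principal component, responds with equal magnitude to a stimulus and its mirror image.

import numpy as np

rng = np.random.default_rng(0)
d = 16
M = np.fliplr(np.eye(d))                    # mirror operator on a 1-D "image"
views = rng.normal(size=(200, d))           # random stimuli standing in for face views
X = np.vstack([views, views @ M.T])         # dataset closed under mirroring

C = X.T @ X / len(X)                        # this covariance commutes with M
w = np.linalg.eigh(C)[1][:, -1]             # top principal component (Oja's rule fixed point)

x = views[0]
print(abs(w @ x), abs(w @ M @ x))           # equal magnitudes: mirror-symmetric tuning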

}, doi = {http://dx.doi.org/10.1016/j.cub.2016.10.015}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and W. A. Freiwald and Tomaso Poggio} } @article {3451, title = {What is changing when: Decoding visual information in movies from human intracranial recordings}, journal = {Neuroimage}, year = {2017}, abstract = {

The majority of visual recognition studies have focused on the neural responses to repeated presentations of static stimuli with abrupt and well-defined onset and offset times. In contrast,\ natural vision\ involves unique renderings of visual inputs that are continuously changing without explicitly defined temporal transitions. Here we considered commercial movies as a coarse proxy to natural vision. We recorded intracranial\ field potential\ signals from 1,284 electrodes implanted in 15 patients with\ epilepsy\ while the subjects passively viewed commercial movies. We could rapidly detect large changes in the visual inputs within approximately 100\ ms of their occurrence, using exclusively field potential signals from ventral visual cortical areas including the inferior temporal\ gyrus\ and inferior occipital gyrus. Furthermore, we could decode the content of those visual changes even in a single movie presentation, generalizing across the wide range of transformations present in a movie. These results present a methodological framework for studying cognition during dynamic and natural vision.

}, doi = {https://doi.org/10.1016/j.neuroimage.2017.08.027}, url = {https://www.sciencedirect.com/science/article/pii/S1053811917306742}, author = {Leyla Isik and Jedediah Singer and Joseph Madsen and Nancy Kanwisher and Gabriel Kreiman} } @proceedings {2682, title = {When and Why Are Deep Networks Better Than Shallow Ones?}, year = {2017}, abstract = {
While the universal approximation property holds both for hierarchical and shallow networks, deep networks can approximate the class of compositional functions as well as shallow networks can, but with an exponentially lower number of training parameters and sample complexity. Compositional functions are obtained as a hierarchy of local constituent functions, where {\textquotedblleft}local functions{\textquotedblright} are functions with low dimensionality. This theorem proves an old conjecture by Bengio on the role of depth in networks, characterizing precisely the conditions under which it holds. It also suggests possible answers to the puzzle of why high-dimensional deep networks trained on large training sets often do not seem to overfit.
}, author = {Hrushikesh Mhaskar and Qianli Liao and Tomaso Poggio} } @article {2557, title = {Why and when can deep-but not shallow-networks avoid the curse of dimensionality: A review}, journal = {International Journal of Automation and Computing}, year = {2017}, month = {03/2017}, pages = {1-17}, abstract = {

The paper reviews and extends an emerging body of theoretical results on deep learning, including the conditions under which it can be exponentially better than shallow learning. A class of deep convolutional networks represents an important special case of these conditions, though weight sharing is not the main reason for their exponential advantage. Implications of a few key theorems are discussed, together with new results, open problems, and conjectures.

}, keywords = {convolutional neural networks, deep and shallow networks, deep learning, function approximation, Machine Learning, Neural Networks}, doi = {10.1007/s11633-017-1054-2}, url = {http://link.springer.com/article/10.1007/s11633-017-1054-2?wt_mc=Internal.Event.1.SEM.ArticleAuthorOnlineFirst}, author = {Tomaso Poggio and Hrushikesh Mhaskar and Lorenzo Rosasco and Brando Miranda and Qianli Liao} } @article {2761, title = {Why does deep and cheap learning work so well?}, journal = {Journal of Statistical Physics}, volume = {168}, year = {2017}, month = {09/2017}, pages = {1223{\textendash}1247}, chapter = {1223}, abstract = {

We show how the success of deep learning could depend not only on mathematics but also on physics: although well-known mathematical theorems guarantee that neural networks can approximate arbitrary functions well, the class of functions of practical interest can frequently be approximated through {\textquotedblleft}cheap learning{\textquotedblright} with exponentially fewer parameters than generic ones. We explore how properties frequently encountered in physics such as symmetry, locality, compositionality, and polynomial log-probability translate into exceptionally simple neural networks. We further argue that when the statistical process generating the data is of a certain hierarchical form prevalent in physics and machine learning, a deep neural network can be more efficient than a shallow one. We formalize these claims using information theory and discuss the relation to the renormalization group. We prove various {\textquotedblleft}no-flattening theorems{\textquotedblright} showing when efficient linear deep networks cannot be accurately approximated by shallow ones without efficiency loss; for example, we show that n variables cannot be multiplied using fewer than 2^n neurons in a single hidden layer.
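
A small worked example of the flavor of these results (our own sketch, not from the paper): a product of n inputs can be computed cheaply with depth by composing pairwise "squaring units" via the identity xy = ((x+y)^2 - (x-y)^2)/4, whereas flattening the computation into a single hidden layer is what incurs the exponential cost.

import numpy as np

def mul2(a, b):
    # Multiply two numbers using only sums and squaring "units".
    return ((a + b) ** 2 - (a - b) ** 2) / 4.0

def product_tree(values):
    # Multiply n numbers with O(n) pairwise units arranged in a log-depth tree.
    values = list(values)
    while len(values) > 1:
        nxt = [mul2(a, b) for a, b in zip(values[::2], values[1::2])]
        if len(values) % 2:
            nxt.append(values[-1])      # carry an unpaired element upward
        values = nxt
    return values[0]

xs = np.random.default_rng(0).uniform(0.5, 1.5, size=8)
print(float(product_tree(xs)), float(np.prod(xs)))   # agree up to float rounding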

}, keywords = {Artificial neural networks, deep learning, Statistical physics}, doi = {10.1007/s10955-017-1836-5}, url = {https://link.springer.com/article/10.1007/s10955-017-1836-5}, author = {Henry Lin and Max Tegmark} } @article {2606, title = {Young children{\textquoteright}s use of distance and angle information during map reading}, year = {2017}, address = {Austin, TX}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {2214, title = {Anchoring and Agreement in Syntactic Annotations}, year = {2016}, month = {09/2016}, abstract = {

Published in the Proceedings of EMNLP 2016

We present a study on two key characteristics of human syntactic annotations: anchoring and agreement. Anchoring is a well-known cognitive bias in human decision making, where judgments are drawn towards preexisting values. We study the influence of anchoring on a standard approach to creation of syntactic resources where syntactic annotations are obtained via human editing of tagger and parser output. Our experiments demonstrate a clear anchoring effect and reveal unwanted consequences, including overestimation of parsing performance and lower quality of annotations in comparison with human-based annotations. Using sentences from the Penn Treebank WSJ, we also report systematically obtained inter-annotator agreement estimates for English dependency parsing. Our agreement results control for parser bias, and are consequential in that they are on par with state of the art parsing performance for English newswire. We discuss the impact of our findings on strategies for future annotation efforts and parser evaluations.

}, author = {Yevgeni Berzak and Yan Huang and Andrei Barbu and Anna Korhonen and Boris Katz} } @article {1760, title = {Atoms of recognition in human and computer vision}, journal = {PNAS }, volume = {113}, year = {2016}, month = {03/2016}, pages = {2744{\textendash}2749}, abstract = {
Discovering the visual features and representations used by the brain to recognize objects is a central problem in the study of vision. Recently, neural network models of visual object recognition, including biological and deep network models, have shown remarkable progress and have begun to rival human performance in some challenging tasks. These models are trained on image examples and learn to extract features and representations and to use them for categorization. It remains unclear, however, whether the representations and learning processes discovered by current models are similar to those used by the human visual system. Here we show, by introducing and using minimal recognizable images, that the human visual system uses features and processes that are not used by current models and that are critical for recognition. We found by psychophysical studies that at the level of minimal recognizable images a minute change in the image can have a drastic effect on recognition, thus identifying features that are critical for the task. Simulations then showed that current models cannot explain this sensitivity to precise feature configurations and, more generally, do not learn to recognize minimal images at a human level. The role of the features shown here is revealed uniquely at the minimal level, where the contribution of each feature is essential. A full understanding of the learning and use of such features will extend our understanding of visual recognition and its cortical mechanisms and will enhance the capacity of computational models to learn from visual experience and to deal with recognition and detailed image interpretation.
}, keywords = {Computer vision, minimal images, object recognition, visual perception, visual representations}, issn = {1091-6490}, doi = {10.1073/pnas.1513198113}, url = {http://www.pnas.org/content/113/10/2744.abstract}, author = {Shimon Ullman and Liav Assif and Eitan Fetaya and Daniel Harari} } @article {1861, title = {A Bayesian nonparametric approach for uncovering rat hippocampal population codes during spatial navigation}, journal = {Journal of Neuroscience Methods}, volume = {263}, year = {2016}, type = {Computational Neuroscience}, chapter = {36}, abstract = {

Rodent hippocampal population codes represent important spatial information about the environment during navigation. Several computational methods have been developed to uncover the neural representation of spatial topology embedded in rodent hippocampal ensemble spike activity. Here we extend our previous work and propose a nonparametric Bayesian approach to infer rat hippocampal population codes during spatial navigation. To tackle the model selection problem, we leverage a nonparametric Bayesian model. Specifically, to analyze rat hippocampal ensemble spiking activity, we apply a hierarchical Dirichlet process-hidden Markov model (HDP-HMM) using two Bayesian inference methods, one based on Markov chain Monte Carlo (MCMC) and the other based on variational Bayes (VB). We demonstrate the effectiveness of our Bayesian approaches on recordings from a freely-behaving rat navigating in an open field environment. We find that MCMC-based inference with Hamiltonian Monte Carlo (HMC) hyperparameter sampling is flexible and efficient, and outperforms VB and MCMC approaches with hyperparameters set by empirical Bayes.
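
A minimal, truncated sketch (ours) of the prior underlying an HDP-HMM: shared top-level state weights drawn by stick-breaking, and each state's transition distribution drawn from a Dirichlet process centered on those shared weights, here in its finite Dirichlet weak-limit form. Concentration values are arbitrary.

import numpy as np

rng = np.random.default_rng(0)
K, gamma, alpha = 20, 2.0, 5.0                 # truncation level, concentrations

# Top-level weights beta ~ GEM(gamma), via truncated stick-breaking.
v = rng.beta(1.0, gamma, size=K)
sticks = np.concatenate(([1.0], np.cumprod(1.0 - v)[:-1]))
beta = v * sticks
beta /= beta.sum()                             # renormalize after truncation

# Each latent state's transition row pi_k ~ DP(alpha, beta) ~ Dirichlet(alpha * beta).
transitions = rng.dirichlet(alpha * beta, size=K)
print(np.round(beta, 3))
print(np.round(transitions[0], 3))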

}, author = {Scott W. Linderman and Matthew J. Johnson and Matthew A. Wilson and Zhe Chen} } @conference {2771, title = {Bayesian nonparametric methods for discovering latent structures of rat hippocampal ensemble spikes}, booktitle = {IEEE Workshop on Machine Learning for Signal Processing}, year = {2016}, month = {09/2016}, address = {Salerno, Italy}, author = {Zhe Chen and Scott W. Linderman and Matthew A. Wilson} } @article {2125, title = {Bottom-up and Top-down Input Augment the Variability of Cortical Neurons.}, journal = {Neuron}, volume = {91(3)}, year = {2016}, pages = {540-547}, author = {Camille G{\'o}mez-Laberge and Alexandra Smolyanskaya and Jonathan J. Nassi and Gabriel Kreiman and Richard T Born} } @article {2034, title = {Bridging the Gaps Between Residual Learning, Recurrent Neural Networks and Visual Cortex}, year = {2016}, month = {04/2016}, abstract = {

We discuss relations between Residual Networks (ResNet), Recurrent Neural Networks (RNNs) and the primate visual cortex. We begin with the observation that a shallow RNN is exactly equivalent to a very deep ResNet with weight sharing among the layers. A direct implementation of such an RNN, although having orders of magnitude fewer parameters, leads to a performance similar to the corresponding ResNet. We propose 1) a generalization of both RNN and ResNet architectures and 2) the conjecture that a class of moderately deep RNNs is a biologically-plausible model of the ventral stream in visual cortex. We demonstrate the effectiveness of the architectures by testing them on the CIFAR-10 dataset.
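
A minimal numerical sketch (ours) of the observation that opens the paper: a shallow RNN whose state is updated by a residual step performs the same computation as a very deep ResNet whose blocks share weights.

import numpy as np

rng = np.random.default_rng(0)
d, T = 8, 10
W = rng.normal(size=(d, d)) * 0.1
relu = lambda z: np.maximum(z, 0.0)
block = lambda h: h + relu(W @ h)          # one residual block / one recurrent step

x = rng.normal(size=d)

h = x.copy()                               # "RNN view": iterate the shared update T times
for _ in range(T):
    h = block(h)

resnet = x.copy()                          # "ResNet view": depth-T network with tied weights
for layer in [block] * T:
    resnet = layer(resnet)

print(np.allclose(h, resnet))              # True: the two views are the same computation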

}, author = {Qianli Liao and Tomaso Poggio} } @article {1984, title = {Building machines that learn and think like people}, year = {2016}, month = {04/2016}, abstract = {

Recent progress in artificial intelligence (AI) has renewed interest in building systems that learn and think like people. Many advances have come from using deep neural networks trained end-to-end in tasks such as object recognition, video games, and board games, achieving performance that equals or even beats humans in some respects. Despite their biological inspiration and performance achievements, these systems differ from human intelligence in crucial ways. We review progress in cognitive science suggesting that truly human-like learning and thinking machines will have to reach beyond current engineering trends in both what they learn, and how they learn it. Specifically, we argue that these machines should (a) build causal models of the world that support explanation and understanding, rather than merely solving pattern recognition problems; (b) ground learning in intuitive theories of physics and psychology, to support and enrich the knowledge that is learned; and (c) harness compositionality and learning-to-learn to rapidly acquire and generalize knowledge to new tasks and situations. We suggest concrete challenges and promising routes towards these goals that can combine the strengths of recent neural network advances with more structured cognitive models.

}, author = {Brenden M Lake and Tomer Ullman and Joshua B. Tenenbaum and Samuel J Gershman} } @article {2261, title = {Cascade of neural processing orchestrates cognitive control in human frontal cortex [code]}, year = {2016}, publisher = {eLife}, abstract = {

Code and data used to create the figures of Tang et al. (2016).\  The results from this work show that a dynamic and hierarchical sequence of steps in human frontal cortex orchestrates cognitive control.

Used in conjunction with this mirrored CBMM Dataset entry

}, url = {http://klab.tch.harvard.edu/resources/tangetal_stroop_2016.html}, author = {Hanlin Tang and Hsiang-Yu Yu and Chien-Chen Chou and Crone, Nathan~E. and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {2262, title = {Cascade of neural processing orchestrates cognitive control in human frontal cortex [dataset]}, year = {2016}, publisher = {eLife}, abstract = {

Code and data used to create the figures of Tang et al. (2016).\  The results from this work show that a dynamic and hierarchical sequence of steps in human frontal cortex orchestrates cognitive control.

Used in conjunction with this mirrored CBMM Code entry

}, url = {http://klab.tch.harvard.edu/resources/tangetal_stroop_2016.html}, author = {Hanlin Tang and Hsiang-Yu Yu and Chien-Chen Chou and Crone, Nathan~E. and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {1847, title = {Cascade of neural processing orchestrates cognitive control in human frontal cortex}, journal = {eLIFE}, year = {2016}, month = {02/2016}, abstract = {
Rapid and flexible interpretation of conflicting sensory inputs in the context of current goals is a critical component of cognitive control that is orchestrated by frontal cortex. The relative roles of distinct subregions within frontal cortex are poorly understood. To examine the dynamics underlying cognitive control across frontal regions, we took advantage of the spatiotemporal resolution of intracranial recordings in epilepsy patients while subjects resolved color-word conflict. We observed differential activity preceding the behavioral responses to conflict trials throughout frontal cortex; this activity was correlated with behavioral reaction times. These signals emerged first in anterior cingulate cortex (ACC) before dorsolateral prefrontal cortex (dlPFC), followed by medial frontal cortex (mFC) and then by orbitofrontal cortex (OFC). These results dissociate the frontal subregions based on their dynamics, and suggest a temporal hierarchy for cognitive control in human cortex.
}, doi = {10.7554/eLife.12352}, url = {http://dx.doi.org/10.7554/eLife.12352}, author = {Hanlin Tang and Yu, HY and Chou, CC and NE Crone and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {1821, title = {Children{\textquoteright}s Expectations and Understanding of Kinship as a Social Category}, journal = {Frontiers in Psychology}, volume = { 7}, year = {2016}, pages = {1664-1078}, abstract = {

In order to navigate the social world, children need to understand and make predictions about how people will interact with one another. Throughout most of human history, social groups have been prominently marked by kinship relations, but few experiments have examined children{\textquoteright}s knowledge of and reasoning about kinship relations.\  In the current studies, we investigated how 3- to 5-year-old children understand kinship relations, compared to non-kin relations between friends, with questions such as, {\textquotedblleft}Who has the same grandmother?{\textquotedblright} We also tested how children expect people to interact based on their relations to one another, with questions such as {\textquotedblleft}Who do you think Cara would like to share her treat with?{\textquotedblright} Both in a storybook context and in a richer context presenting more compelling cues to kinship using face morphology, 3- and 4-year-old children failed to show either robust explicit conceptual distinctions between kin and friends, or expectations of behavior favoring kin over friends, even when asked about their own social partners. By 5 years, children{\textquoteright}s understanding of these relations improved, and they showed some expectation that others will preferentially aid siblings over friends.\  Together, these findings suggest that explicit understanding of kinship develops slowly over the preschool years.

}, issn = {1664-1078}, doi = {10.3389/fpsyg.2016.00440 }, url = {http://journal.frontiersin.org/article/10.3389/fpsyg.2016.00440/full}, author = {A C Spokes and Elizabeth S Spelke} } @inbook {2598, title = {Cognitive abilities of infants}, booktitle = {Scientists Making a Difference: One Hundred Eminent Behavioral and Brain Scientists Talk about Their Most Important Contributions}, year = {2016}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {Cambridge, UK}, author = {Elizabeth S Spelke and R. J. Sternberg and S. T. Fiske and D. J. Foss} } @article {2572, title = {Color-Biased Regions of the Ventral Visual Pathway Lie between Face- and Place-Selective Regions in Humans, as in Macaques}, journal = {Journal of Neuroscience}, volume = {36}, year = {2016}, month = {02/2016}, pages = {1682 - 1697}, abstract = {

The existence of color-processing regions in the human ventral visual pathway (VVP) has long been known from patient and imaging studies, but their location in the cortex relative to other regions, their selectivity for color compared with other properties (shape and object category), and their relationship to color-processing regions found in nonhuman primates remain unclear. We addressed these questions by scanning 13 subjects with fMRI while they viewed two versions of movie clips (colored, achromatic) of five different object classes (faces, scenes, bodies, objects, scrambled objects). We identified regions in each subject that were selective for color, faces, places, and object shape, and measured responses within these regions to the 10 conditions in independently acquired data. We report two key findings. First, the three previously reported color-biased regions (located within a band running posterior{\textendash}anterior along the VVP, present in most of our subjects) were sandwiched between face-selective cortex and place-selective cortex, forming parallel bands of face, color, and place selectivity that tracked the fusiform gyrus/collateral sulcus. Second, the posterior color-biased regions showed little or no selectivity for object shape or for particular stimulus categories and showed no interaction of color preference with stimulus category, suggesting that they code color independently of shape or stimulus category; moreover, the shape-biased lateral occipital region showed no significant color bias. These observations mirror results in macaque inferior temporal cortex (Lafer-Sousa and Conway, 2013), and taken together, these results suggest a homology in which the entire tripartite face/color/place system of primates migrated onto the ventral surface in humans over the course of evolution.

SIGNIFICANCE STATEMENT Here we report that color-biased cortex is sandwiched between face-selective and place-selective cortex on the bottom surface of the brain in humans. This face/color/place organization mirrors that seen on the lateral surface of the temporal lobe in macaques, suggesting that the entire tripartite system is homologous between species. This result validates the use of macaques as a model for human vision, making possible more powerful investigations into the connectivity, precise neural codes, and development of this part of the brain. In addition, we find substantial segregation of color from shape selectivity in posterior regions, as observed in macaques, indicating a considerable dissociation of the processing of shape and color in both species.

}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.3164-15.2016}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.3164-15.2016}, author = {R. Lafer-Sousa and B. R. Conway and Nancy Kanwisher} } @article {1808, title = {Continuous representations of action efficiency in infancy}, year = {2016}, month = {01/2016}, abstract = {

In reasoning about action, infants apply the principle of efficiency, recovering attention when agents pursue goals using curvilinear paths if a straight path was available (Csibra et al., 1999). What representations support these capacities? The present research explores the hypothesis that infants represent cost as a continuous function within a naive utility calculus (Jara-Ettinger et al., 2015) by testing 6-month-old infants{\textquoteright} expectations for efficiency using action trajectories differing in curvature. In Study 1, we habituated infants to a rational agent, whose goal-directed actions were constrained by tall barriers, and then measured how long infants attended when the same agent navigated over a novel, low barrier efficiently or inefficiently. In Study 2, we asked whether infants recover attention to inefficient actions solely on the basis of low-level perceptual properties by repeating Study 1 but moving the barrier beyond the agent{\textquoteright}s goal, causing all actions to be unconstrained. In Study 3, we used the unconstrained habituation events from Study 2 and the constrained test events from Study 1 to test whether infants expect an irrational agent to act efficiently given a novel constraint. Across these studies, we demonstrate that 6-month-olds (1) analyze trajectories of goal-directed action differing in curvature on the basis of their efficiency, (2) expect minimally costly action given novel constraints, even for previously irrational agents, and (3) differentiate between these actions on the basis of efficiency, not low-level perceptual differences in height or motion. Our findings indicate that continuous cost representations support an early, robust expectation for rational action.\ 

}, author = {Shari Liu and Elizabeth S Spelke} } @article {2132, title = {Contrastive Analysis with Predictive Power: Typology Driven Estimation of Grammatical Error Distributions in ESL}, year = {2016}, month = {07/2015}, abstract = {

This work examines the impact of crosslinguistic transfer on grammatical errors in English as a Second Language (ESL) texts. Using a computational framework that formalizes the theory of Contrastive Analysis (CA), we demonstrate that language-specific error distributions in ESL writing can be predicted from the typological properties of the native language and their relation to the typology of English. Our typology-driven model enables us to obtain accurate estimates of such distributions without access to any ESL data for the target languages. Furthermore, we present a strategy for adjusting our method to low-resource languages that lack typological documentation, using a bootstrapping approach that approximates native language typology from ESL texts. Finally, we show that our framework is instrumental for linguistic inquiry seeking to identify first-language factors that contribute to a wide range of difficulties in second language acquisition.

}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} } @conference {2590, title = {Coordinate to cooperate or compete: abstract goals and joint intentions in social interaction}, booktitle = {Proceedings of the 38th Annual Conference of the Cognitive Science Society}, year = {2016}, author = {Max Kleiman-Weiner and Ho, Mark K and Austerweil, Joe L and Littman, Michael L and Joshua B. Tenenbaum} } @inbook {2597, title = {Core knowledge and conceptual change: A perspective on social cognition}, booktitle = {Core Knowledge and Conceptual Change}, year = {2016}, publisher = {Oxford University Press}, organization = {Oxford University Press}, address = {New York}, author = {Elizabeth S Spelke and D. Barner and A. S. Baron} } @article {2760, title = {Critical Behavior from Deep Dynamics: A Hidden Dimension in Natural Language}, journal = {arXiv.org}, year = {2016}, month = {06/2016}, author = {Henry Lin and Max Tegmark} } @article {2619, title = {Decoding task and stimulus representations in face-responsive cortex}, journal = {Cognitive Neuropsychology}, year = {2016}, author = {Dorit Kliemann and Nir Jacoby and Stefano Anzellotti and Rebecca Saxe} } @article {2337, title = {Deep Learning: Mathematics and Neuroscience}, journal = {A Sponsored Supplement to Science}, volume = {Brain-Inspired intelligent robotics: The intersection of robotics and neuroscience}, year = {2016}, month = {12/2016}, pages = {9-12}, chapter = {9}, abstract = {

Understanding the nature of intelligence is one of the greatest challenges in science and technology today. Making significant progress toward this goal will require the interaction of several disciplines including neuroscience and cognitive science, as well as computer science, robotics, and machine learning. In this paper, I will discuss the implications of recent empirical successes in many applications, such as image categorization, face identification, localization, and action recognition, achieved through a machine learning technique called "deep learning," which is based on multi-layer or hierarchical neural networks. Such neural networks have become a central tool in machine learning.

}, url = {http://science.imirus.com/Mpowered/imirus.jsp?volume=scim16\&issue=6\&page=10}, author = {Tomaso Poggio} } @article {2066, title = {Deep Learning: mathematics and neuroscience}, year = {2016}, month = {04/2016}, abstract = {

Science and Engineering of Intelligence

The problems of Intelligence are, together, the greatest problem in science and technology today. Making significant progress towards their solution will require the interaction of several disciplines involving neuroscience and cognitive science in addition to computer science, robotics and machine learning...

}, author = {Tomaso Poggio} } @article {3662, title = {Deep vs. shallow networks: An approximation theory perspective}, journal = {Analysis and Applications}, volume = {14}, year = {2016}, month = {01/2016}, pages = {829 - 848}, abstract = {
The paper briefly reviews several recent results on hierarchical architectures for learning from examples, that may formally explain the conditions under which Deep Convolutional Neural Networks perform much better in function approximation problems than shallow, one-hidden layer architectures. The paper announces new results for a non-smooth activation function {\textemdash} the ReLU function {\textemdash} used in present-day neural networks, as well as for the Gaussian networks. We propose a new definition of relative dimension to encapsulate different notions of sparsity of a function class that can possibly be exploited by deep networks but not by shallow ones to drastically reduce the complexity required for approximation and learning.
}, keywords = {blessed representation, deep and shallow networks, Gaussian networks, ReLU networks}, issn = {0219-5305}, doi = {10.1142/S0219530516400042}, url = {http://www.worldscientific.com/doi/abs/10.1142/S0219530516400042}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @article {2183, title = {Deep vs. shallow networks : An approximation theory perspective}, year = {2016}, month = {08/2016}, abstract = {

The paper briefly reviews several recent results on hierarchical architectures for learning from examples, that may formally explain the conditions under which Deep Convolutional Neural Networks perform much better in function approximation problems than shallow, one-hidden layer architectures. The paper announces new results for a non-smooth activation function {\textendash} the ReLU function {\textendash} used in present-day neural networks, as well as for the Gaussian networks. We propose a new definition of relative dimension to encapsulate different notions of sparsity of a function class that can possibly be exploited by deep networks but not by shallow ones to drastically reduce the complexity required for approximation and learning.\ 

Journal submitted version.

}, author = {Hrushikesh Mhaskar and Tomaso Poggio} } @article {2133, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, year = {2016}, month = {09/2016}, abstract = {

Understanding language goes hand in hand with the ability to integrate complex contextual information obtained via perception. In this work, we present a novel task for grounded language understanding: disambiguating a sentence given a visual scene which depicts one of the possible interpretations of that sentence. To this end, we introduce a new multimodal corpus containing ambiguous sentences, representing a wide range of syntactic, semantic and discourse ambiguities, coupled with videos that visualize the different interpretations for each sentence. We address this task by extending a vision model which determines if a sentence is depicted by a video. We demonstrate how such a model can be adjusted to recognize different interpretations of the same underlying sentence, allowing to disambiguate sentences in a unified fashion across the different ambiguity types.

}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @proceedings {2680, title = {DOC: Deep OCclusion Recovering From A Single Image}, year = {2016}, abstract = {

Recovering the occlusion relationships between objects is a fundamental human visual ability which yields important information about the 3D world. In this paper we propose a deep network architecture, called DOC, which acts on a single image, detects object boundaries and estimates the border ownership (i.e. which side of the boundary is foreground and which is background). We represent occlusion relations by a binary edge map, to indicate the object boundary, and an occlusion orientation variable which is tangential to the boundary and whose direction specifies border ownership by a left-hand rule. We train two related deep convolutional neural networks, called DOC, which exploit local and non-local image cues to estimate this representation and hence recover occlusion relations. In order to train and test DOC we construct a large-scale instance occlusion boundary dataset using PASCAL VOC images, which we call the PASCAL instance occlusion dataset (PIOD). This contains 10,000 images and hence is two orders of magnitude larger than existing occlusion datasets for outdoor images. We test two variants of DOC on PIOD and on the BSDS occlusion dataset and show they outperform state-of-the-art methods. Finally, we perform numerous experiments investigating multiple settings of DOC and transfer between BSDS and PIOD, which provides more insights for further study of occlusion estimation.

}, author = {Peng Wang and Alan Yuille} } @article {2809, title = {Dynamic balance of excitation and inhibition in human and monkey neocortex}, journal = {Nature Scientific Reports}, year = {2016}, abstract = {

Balance of excitation and inhibition is a fundamental feature of\ in vivo\ network activity and is important for its computations. However, its presence in the neocortex of higher mammals is not well established. We investigated the dynamics of excitation and inhibition using dense multielectrode recordings in humans and monkeys. We found that in all states of the wake-sleep cycle, excitatory and inhibitory ensembles are well balanced, and co-fluctuate with slight instantaneous deviations from perfect balance, mostly in slow-wave sleep. Remarkably, these correlated fluctuations are seen across many different temporal scales. The similarity of these computational features with a network model of self-generated balanced states suggests that such balanced activity is essentially generated by recurrent activity in the local network and is not due to external inputs. Finally, we find that this balance breaks down during seizures, where the temporal correlation of excitatory and inhibitory populations is disrupted. These results show that balanced activity is a feature of normal brain activity, and that its breakdown could be an important factor in defining pathological states.

}, doi = {10.1038/srep23176}, url = {http://www.nature.com/articles/srep23176}, author = {Nima Dehghani} } @conference {1820, title = {Early Reasoning about Affiliation and Social Networks}, booktitle = {International Conference on Infant Studies (ICIS)}, year = {2016}, month = {05/2016}, address = {New Orleans, LA}, author = {A C Spokes and Elizabeth S Spelke} } @conference {1962, title = {Effort as a bridging concept across action and action understanding: Weight and Physical Effort in Predictions of Efficiency in Other Agents}, booktitle = {International Conference on Infant Studies (ICIS) }, year = {2016}, month = {05/2016}, address = {New Orleans, Louisiana}, author = {Tomer Ullman and Joshua B. Tenenbaum and Elizabeth S Spelke} } @article {2748, title = {Environmental statistics enable perceptual separation of sound and space}, year = {2016}, abstract = {

The sound that reaches our ears from colliding objects (i.e., bouncing, scraping, rolling, etc.) is structured, both by the physical characteristics of the sound source and by environmental reverberation. The inference of any one single parameter (mass, size, material, motion, room size, distance) is ill-posed, yet humans can simultaneously identify properties of sound sources and environments from the resulting sound, via mechanisms that remain unclear. We investigate whether our ability to recognize sound sources and spaces reflects an ability to separately infer how physical factors affect sound, and whether any such separation is enabled by statistical regularities of real-world sounds and real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of both solid objects and environmental spaces sampled from the distribution encountered by humans during daily life. Both the objects and the sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates. Object IRs showed sharp spectral peaks due to strong resonances, and environmental IRs showed broad frequency variation: mid frequencies reverberated longest while higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans utilize these regularities to separate reverberation from sources, we manipulated environmental IR characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but we found that their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.

}, author = {James Traer and Josh H. McDermott} } @article {1596, title = {Fast, invariant representation for human action in the visual system}, year = {2016}, month = {01/2016}, abstract = {

Isik, L*, Tacchetti, A*, and Poggio, T (* authors contributed equally to this work)

The ability to recognize the actions of others from visual input is essential to humans{\textquoteright} daily lives. The neural computations underlying action recognition, however, are still poorly understood. We use magnetoencephalography (MEG) decoding and a computational model to study action recognition from a novel dataset of well-controlled, naturalistic videos of five actions (run, walk, jump, eat, drink) performed by five actors at five viewpoints. We show for the first time that actor- and view-invariant representations for action arise in the human brain as early as 200 ms. We next extend a class of biologically inspired hierarchical computational models of object recognition to recognize actions from videos and explain the computations underlying our MEG findings. This model achieves 3D viewpoint-invariance by the same biologically inspired computational mechanism it uses to build invariance to position and scale. These results suggest that robustness to complex transformations, such as 3D viewpoint invariance, does not require special neural architectures, and further provide a mechanistic explanation of the computations driving invariant action recognition.

}, url = {http://arxiv.org/abs/1601.01358}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {1617, title = {Foveation-based Mechanisms Alleviate Adversarial Examples}, number = {044}, year = {2016}, month = {01/2016}, abstract = {

We show that adversarial examples, i.e., the visually imperceptible perturbations that cause Convolutional Neural Networks (CNNs) to fail, can be alleviated with a mechanism based on foveations---applying the CNN in different image regions. To see this, first, we report results in ImageNet that lead to a revision of the hypothesis that adversarial perturbations are a consequence of CNNs acting as a linear classifier: CNNs act locally linearly to changes in the image regions with objects recognized by the CNN, and in other regions the CNN may act non-linearly. Then, we corroborate that when the neural responses are linear, applying the foveation mechanism to the adversarial example tends to significantly reduce the effect of the perturbation. This is because, hypothetically, the CNNs for ImageNet are robust to changes of scale and translation of the object produced by the foveation, but this property does not generalize to transformations of the perturbation. As a result, the accuracy after a foveation is almost the same as the accuracy of the CNN without the adversarial perturbation, even if the adversarial perturbation is calculated taking into account a foveation.
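As a minimal sketch of the general idea (assuming a placeholder classifier callable `cnn(image)` that returns class probabilities; this is not the paper's setup), a foveation mechanism can be approximated by classifying object-centered crops and averaging:

```python
# Hedged sketch: average CNN predictions over several object-centered crops
# ("foveations") instead of classifying the full image once. `cnn` and the
# crop boxes are placeholders, not the networks or regions used in the paper.
import numpy as np

def foveated_predict(cnn, image, boxes, size=224):
    """boxes: list of (x0, y0, x1, y1) crop rectangles around the object."""
    probs = []
    for (x0, y0, x1, y1) in boxes:
        crop = image[y0:y1, x0:x1]
        # naive nearest-neighbour resize of the crop to the CNN input size
        ys = np.linspace(0, crop.shape[0] - 1, size).astype(int)
        xs = np.linspace(0, crop.shape[1] - 1, size).astype(int)
        probs.append(cnn(crop[np.ix_(ys, xs)]))
    return np.mean(probs, axis=0)  # averaged class probabilities
```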

}, author = {Luo, Yan and X Boix and Gemma Roig and Tomaso Poggio and Qi Zhao} } @article {2612, title = {From agents to actions to interactions: Uncovering multiple social networks in the primate brain}, year = {2016}, abstract = {

Our brain continuously decodes the complex visual scenes unfolding in front of us: both the nature of the material entities we perceive, such as objects and individuals, and their immaterial interactions. Interactions are recognized quickly and effortlessly by primates: they understand fights, grooming, and play, but also colliding objects that exchange forces following the physical laws of classical mechanics. Interactions are fundamental in that they reveal hidden properties of objects, e.g. their weight or material, and of individuals, e.g. their dominance status or relationship, and by doing so they inform the observer about its own position and prospects regarding those entities. However, little is known about the brain regions that track and process social and physical interactions. In order to chart these regions, videos of three types of interactions, 1) social interactions between monkeys, 2) interactions between monkeys and objects or their environment, and 3) physical interactions between objects, were presented to four rhesus monkeys being scanned for fMRI acquisition with contrast agent. Whole-brain activity for watching blocks of interactions was compared to the activity for watching control videos of monkeys making no actions, objects moving with no interactions, landscapes, and scrambled motion videos, using Fixed Effects (FFX) Generalized Linear Model (GLM) group analysis and conjunction analyses. We show that watching interactions over-activates the STS, but also engages two sets of regions located outside it: 1) it activates the fronto-parietal mirror neuron system (mapped independently using a classic localizer) more than watching the non-interactive goal-directed behaviors that define the system; 2) in the case of social interactions, it additionally and exclusively activates the medial prefrontal cortex (mPFC), a putative temporo-parietal junction homolog, and the temporal pole (TP), which appear to correspond to the human mentalizing network. These two networks are fed differentially by patches of STS cortex (mapped independently using a classic Face-Object-Body patch localizer): face patches co-activate with the social brain, while body patches co-activate with both the mirror neuron system and the social brain. These results demonstrate that combining individuals or objects into evocative units modulates basic mechanisms of object and individual perception in the STS; they reveal the mirror neuron system{\textquoteright}s nature as a node of convergence between the social and non-social brain, and suggest that humans{\textquoteright} unique and sophisticated mind-reading ability evolved from a faculty, shared with our monkey kin, for reading social interactions.

}, author = {J. Sliwa and W. A. Freiwald} } @article {2570, title = {Functional neuroanatomy of intuitive physical inference}, journal = {Proceedings of the National Academy of Sciences}, volume = {113}, year = {2016}, month = {06/2016}, pages = {E5072 - E5081}, abstract = {

To engage with the world{\textemdash}to understand the scene in front of us, plan actions, and predict what will happen next{\textemdash}we must have an intuitive grasp of the world{\textquoteright}s physical structure and dynamics. How do the objects in front of us rest on and support each other, how much force would be required to move them, and how will they behave when they fall, roll, or collide? Despite the centrality of physical inferences in daily life, little is known about the brain mechanisms recruited to interpret the physical structure of a scene and predict how physical events will unfold. Here, in a series of fMRI experiments, we identified a set of cortical regions that are selectively engaged when people watch and predict the unfolding of physical events{\textemdash}a {\textquotedblleft}physics engine{\textquotedblright} in the brain. These brain regions are selective to physical inferences relative to nonphysical but otherwise highly similar scenes and tasks. However, these regions are not exclusively engaged in physical inferences per se or, indeed, even in scene understanding; they overlap with the domain-general {\textquotedblleft}multiple demand{\textquotedblright} system, especially the parts of that system involved in action planning and tool use, pointing to a close relationship between the cognitive and neural mechanisms involved in parsing the physical content of a scene and preparing an appropriate action.

}, issn = {0027-8424}, doi = {10.1073/pnas.1610344113}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1610344113}, author = {Fischer, Jason and Mikhael, John G. and Joshua B. Tenenbaum and Nancy Kanwisher} } @article {2788, title = {Functional neuroanatomy of intuitive physical inference.}, year = {2016} } @conference {1961, title = {The Functions of Infants{\textquoteright} Social Categorization: Early Reasoning about Affiliation and Social Networks}, booktitle = {International Conference on Infant Studies (ICIS)}, year = {2016}, month = {05/2016}, address = {New Orleans, Louisiana}, author = {A C Spokes and Elizabeth S Spelke} } @conference {1983, title = {Generation and Comprehension of Unambiguous Object Descriptions}, booktitle = {The Conference on Computer Vision and Pattern Recognition (CVPR)}, year = {2016}, month = {06/2016}, address = {Las Vegas, Nevada}, abstract = {
We propose a method that can generate an unambiguous description (known as a referring expression) of a specific object or region in an image, and which can also comprehend or interpret such an expression to infer which object is being described.
We show that our method outperforms previous methods that generate descriptions of objects without taking into account other potentially ambiguous objects in the scene.
Our model is inspired by recent successes of deep learning methods for image captioning, but while image captioning is difficult to evaluate,  our task allows for easy objective evaluation.
We also present a new large-scale dataset for referring expressions, based on
MS-COCO.
We have released the dataset and a toolbox for visualization and evaluation, see \url{https://github.com/mjhucla/Google_Refexp_toolbox}.
}, url = {https://github.com/mjhucla/Google_Refexp_toolbox}, author = {Junhua Mao and Jonathan Huang and Alexander Toshev and Oana Camburu and Alan Yuille and Kevin Murphy} } @article {1594, title = {Group Invariant Deep Representations for Image Instance Retrieval}, year = {2016}, month = {01/2016}, abstract = {

Most image instance retrieval pipelines are based on comparison of vectors known as global image descriptors between a query image and the database images. Due to their success in large scale image classification, representations extracted from Convolutional Neural Networks (CNN) are quickly gaining ground on Fisher Vectors (FVs) as state-of-the-art global descriptors for image instance retrieval. While CNN-based descriptors are generally noted for good retrieval performance at lower bitrates, they nevertheless present a number of drawbacks, including a lack of robustness to common object transformations such as rotations, compared with their interest-point-based FV counterparts.


In this paper, we propose a method for computing invariant global descriptors from CNNs. Our method implements a recently proposed mathematical theory for invariance in a sensory cortex modeled as a feedforward neural network. The resulting global descriptors can be made invariant to multiple arbitrary transformation groups while retaining good discriminativeness.


Based on a thorough empirical evaluation using several publicly available datasets, we show that our method is able to significantly and consistently improve retrieval results every time a new type of invariance is incorporated. We also show that our method, which has few parameters, is not prone to overfitting: improvements generalize well across datasets with different properties with regard to invariances. Finally, we show that our descriptors are able to compare favourably to other state-of-the-art compact descriptors in similar bitrate ranges, exceeding the highest retrieval results reported in the literature on some datasets. A dedicated dimensionality reduction step {\textendash}quantization or hashing{\textendash} may be able to further improve the competitiveness of the descriptors.
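As a minimal sketch of the general recipe (the `cnn_descriptor` feature extractor is a placeholder, and this naive group-average pooling stands in for the i-theory-derived construction of the paper):

```python
# Hedged sketch: make a global descriptor approximately invariant to a
# transformation group by pooling descriptors of transformed copies of the
# image. Here the group is 90-degree rotations and the pooling is a mean.
import numpy as np

def rotation_invariant_descriptor(cnn_descriptor, image, n_rotations=4):
    descs = [cnn_descriptor(np.rot90(image, k)) for k in range(n_rotations)]
    pooled = np.mean(descs, axis=0)
    return pooled / (np.linalg.norm(pooled) + 1e-12)  # L2-normalize for retrieval
```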

}, author = {Olivier Mor{\`e}re and Antoine Veillard and Jie Lin and Julie Petta and Vijay Chandrasekhar and Tomaso Poggio} } @article {2811, title = {High-frequency oscillations in human and monkey neocortex during the wake{\textendash}sleep cycle}, journal = {Proceedings of the National Academy of Sciences}, year = {2016}, doi = {10.1073/pnas.1523583113}, url = {http://www.pnas.org/content/113/33/9363}, author = {Michel Le Van Quyen and Lyle Muller and Bartosz Telenczuk and Eric Halgren and Sydney Cash and Nicholas Hatsopoulos and Nima Dehghani and Alain Destexhe} } @conference {1813, title = {Holographic Embeddings of Knowledge Graphs}, booktitle = {Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16)}, year = {2016}, address = {Phoenix, Arizona, USA}, abstract = {

Learning embeddings of entities and relations is an efficient and versatile method to perform machine learning on relational data such as knowledge graphs. In this work, we propose holographic embeddings (HolE) to learn compositional vector space representations of entire knowledge graphs. The proposed method is related to holographic models of associative memory in that it employs circular correlation to create compositional representations. By using correlation as the compositional operator HolE can capture rich interactions but simultaneously remains efficient to compute, easy to train, and scalable to very large datasets. In extensive experiments we show that holographic embeddings are able to outperform state-of-the-art methods for link prediction in knowledge graphs and relational learning benchmark datasets.
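As an illustration of the compositional operator only (a hedged NumPy sketch with random stand-in vectors; not the authors' implementation or training procedure):

```python
# Circular correlation computed via the FFT, as used for holographic
# composition, and a sigmoid score for a (head, relation, tail) triple.
import numpy as np

def circular_correlation(a, b):
    # [a (star) b]_k = sum_i a_i * b_{(i + k) mod d}, in O(d log d) via FFT.
    return np.fft.ifft(np.conj(np.fft.fft(a)) * np.fft.fft(b)).real

def hole_score(head, relation, tail):
    # Plausibility of the triple: sigmoid of the relation vector dotted with
    # the correlated entity pair.
    return 1.0 / (1.0 + np.exp(-relation @ circular_correlation(head, tail)))

rng = np.random.default_rng(0)
h, r, t = (rng.standard_normal(64) for _ in range(3))
print(hole_score(h, r, t))
```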

}, author = {Maximilian Nickel and Lorenzo Rosasco and Tomaso Poggio} } @conference {2629, title = {How Important Is Weight Symmetry in Backpropagation?}, booktitle = {Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16)}, year = {2016}, address = {Phoenix, AZ.}, abstract = {

Gradient backpropagation (BP) requires symmetric feedforward and feedback connections -- the same weights must be used for forward and backward passes. This "weight transport problem" (Grossberg 1987) is thought to be one of the main reasons to doubt BP{\textquoteright}s biological plausibility. Using 15 different classification datasets, we systematically investigate to what extent BP really depends on weight symmetry. In a study that turned out to be surprisingly similar in spirit to Lillicrap et al.{\textquoteright}s demonstration (Lillicrap et al. 2014) but orthogonal in its results, our experiments indicate that: (1) the magnitudes of feedback weights do not matter to performance; (2) the signs of feedback weights do matter -- the more concordant the signs between feedforward connections and their corresponding feedback connections, the better; (3) with feedback weights having random magnitudes and 100\% concordant signs, we were able to achieve the same or even better performance than SGD; and (4) some normalizations/stabilizations are indispensable for such asymmetric BP to work, namely Batch Normalization (BN) (Ioffe and Szegedy 2015) and/or a "Batch Manhattan" (BM) update rule.
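As a toy sketch of the key manipulations (our simplification to a single linear layer; not the paper's experimental setup or code):

```python
# Sign-concordant feedback: the backward pass uses a matrix that shares only
# the signs of the feedforward weights, with random magnitudes. Parameters are
# updated with a "Batch Manhattan"-style rule (a fixed step in the sign of the
# gradient).
import numpy as np

rng = np.random.default_rng(0)
n_in, n_out, lr = 8, 4, 1e-2

W = rng.standard_normal((n_out, n_in))                 # feedforward weights
B = np.sign(W) * np.abs(rng.standard_normal(W.shape))  # sign-concordant feedback

x = rng.standard_normal(n_in)
target = rng.standard_normal(n_out)

y = W @ x                        # forward pass (single linear layer)
delta_out = y - target           # output error signal
delta_in = B.T @ delta_out       # what would be sent to an earlier layer
                                 # (exact BP would use W.T instead of B.T)

grad_W = np.outer(delta_out, x)  # gradient w.r.t. W, as in standard BP
W -= lr * np.sign(grad_W)        # Batch Manhattan: sign-only update
```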

}, url = {https://cbmm.mit.edu/sites/default/files/publications/liao-leibo-poggio.pdf}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @conference {1651, title = {How Important Is Weight Symmetry in Backpropagation?}, booktitle = {Thirtieth AAAI Conference on Artificial Intelligence (AAAI-16)}, year = {2016}, month = {Accepted}, publisher = {Association for the Advancement of Artificial Intelligence}, organization = {Association for the Advancement of Artificial Intelligence}, address = {Phoenix, AZ.}, abstract = {

Gradient backpropagation (BP) requires symmetric feedforward and feedback connections -- the same weights must be used for forward and backward passes. This "weight transport problem" (Grossberg 1987) is thought to be one of the main reasons to doubt BP{\textquoteright}s biological plausibility. Using 15 different classification datasets, we systematically investigate to what extent BP really depends on weight symmetry. In a study that turned out to be surprisingly similar in spirit to Lillicrap et al.{\textquoteright}s demonstration (Lillicrap et al. 2014) but orthogonal in its results, our experiments indicate that: (1) the magnitudes of feedback weights do not matter to performance; (2) the signs of feedback weights do matter -- the more concordant the signs between feedforward connections and their corresponding feedback connections, the better; (3) with feedback weights having random magnitudes and 100\% concordant signs, we were able to achieve the same or even better performance than SGD; and (4) some normalizations/stabilizations are indispensable for such asymmetric BP to work, namely Batch Normalization (BN) (Ioffe and Szegedy 2015) and/or a "Batch Manhattan" (BM) update rule.

}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {1818, title = {How Infants Reason About Affective States and Social Interactions}, year = {2016}, month = {05/2016}, address = {New Orleans, Louisiana }, author = {A C Spokes}, editor = {Elizabeth S Spelke} } @conference {2585, title = {Human Pose Estimation Using Deep Consensus Voting}, booktitle = {ECCV 2016}, year = {2016}, abstract = {

In this paper we consider the problem of human pose estimation from a single still image. We propose a novel approach where each location in the image votes for the position of each keypoint using a convolutional neural net. The voting scheme allows us to utilize information from the whole image, rather than rely on a sparse set of keypoint locations. Using dense, multi-target votes not only produces good keypoint predictions, but also enables us to compute image-dependent joint keypoint probabilities by looking at consensus voting. This differs from most previous methods, where joint probabilities are learned from relative keypoint locations and are independent of the image. We finally combine the keypoint votes and joint probabilities in order to identify the optimal pose configuration. We show competitive performance on the MPII Human Pose and Leeds Sports Pose datasets.
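As a schematic of the vote-accumulation step only (array names and shapes are our assumptions; in the paper the votes themselves come from a convolutional network):

```python
# Accumulate weighted per-pixel votes for one keypoint into a heatmap and take
# the consensus (argmax) location as the prediction.
import numpy as np

def consensus_keypoint(votes, weights, height, width):
    """votes: (N, 2) integer (row, col) locations voted for by N image positions;
    weights: (N,) confidence of each vote."""
    heatmap = np.zeros((height, width))
    for (r, c), w in zip(votes, weights):
        if 0 <= r < height and 0 <= c < width:
            heatmap[r, c] += w
    peak = np.unravel_index(np.argmax(heatmap), heatmap.shape)
    return peak, heatmap
```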

}, author = {Ita Lifshitz and Ethan Fetaya and Shimon Ullman} } @article {2768, title = {Improved Measures of Integrated Information}, journal = {PLOS Computational Biology}, year = {2016}, month = {11/2016}, abstract = {

Although there is growing interest in measuring integrated information in computational and cognitive systems, current methods for doing so in practice are computationally unfeasible. Existing and novel integration measures are investigated and classified by various desirable properties. A simple taxonomy of Φ-measures is presented where they are each characterized by their choice of factorization method (5 options), choice of probability distributions to compare (3 {\texttimes} 4 options) and choice of measure for comparing probability distributions (7 options). When requiring the Φ-measures to satisfy a minimum of attractive properties, these hundreds of options reduce to a mere handful, some of which turn out to be identical. Useful exact and approximate formulas are derived that can be applied to real-world data from laboratory experiments without posing unreasonable computational demands.
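For concreteness, the option count implied by the numbers above (our arithmetic, not a figure stated explicitly in the text) is

\[ 5 \times (3 \times 4) \times 7 = 420 \]

candidate Φ-measures, which the minimal set of attractive properties then prunes to a handful.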

}, doi = {10.1371/journal.pcbi.100512310.1371}, url = {http://dx.plos.org/10.1371/journal.pcbi.1005123}, author = {Max Tegmark}, editor = {Seth, Anil} } @article {1792, title = {Individual Differences in Face Looking Behavior Generalize from the Lab to the World}, journal = {Journal of Vision}, volume = {16}, year = {2016}, month = {05/2016}, chapter = {12}, abstract = {

Recent laboratory studies have found large, stable individual differences in the location people first fixate when identifying faces, ranging from the brows to the mouth. Importantly, this variation is strongly associated with differences in fixation-specific identification performance such that an individual{\textquoteright}s recognition ability is maximized when looking at their preferred location (Mehoudar, Arizpe, Baker, \& Yovel, 2014; Peterson \& Eckstein, 2013). This finding suggests that face representations are retinotopic and individuals enact gaze strategies that optimize identification, yet the extent to which this behavior reflects real-world gaze behavior is unknown. Here, we used mobile eye-trackers to test whether individual differences in face-gaze generalize from lab to real-world vision. In-lab fixations were measured with a speeded face identification task, while real-world behavior was measured as subjects freely walked around the MIT campus. We found a strong correlation between the patterns of individual differences in face-gaze in the laboratory and real-world settings. Our findings support the hypothesis that individuals optimize real-world face identification by consistently fixating the same location and thus strongly constraining the space of retinotopic input. The methods developed for this study entailed collecting a large set of high-definition, wide field-of-view natural videos from head-mounted cameras and the viewer{\textquoteright}s fixation position, allowing us to characterize subjects{\textquoteright} actually-experienced real-world retinotopic images. These images enable us to ask how vision is optimized not just for the statistics of the {\textquotedblleft}natural images{\textquotedblright} found in web databases, but for the truly natural, retinotopic images that have landed on actual human retinae during real-world experience.

}, doi = {10.1167/16.7.12.}, url = {http://jov.arvojournals.org/article.aspx?articleid=2524135\&resultClick=1}, author = {M.F. Peterson and Jing Lin and Ian Zaun and Nancy Kanwisher} } @article {2867, title = {Individual differences in face-looking behavior generalize from the lab to the world.}, journal = {Journal of Vision}, year = {2016}, author = {M.F. Peterson and J. Lin and Ian Zaun and Nancy Kanwisher} } @article {2135, title = {The infancy of the human brain}, year = {2016}, month = {10/2016}, abstract = {

The human infant brain is the only known machine able to master a natural language and develop explicit, symbolic, and communicable systems of knowledge that deliver rich representations of the external world. With the emergence of non-invasive brain imaging, we now have access to the unique neural machinery underlying these early accomplishments. After describing early cognitive capacities in the domains of language and number, we review recent findings that underline the strong continuity between human infants{\textquoteright} and adults{\textquoteright} neural architecture, with notably early hemispheric asymmetries and involvement of frontal areas. Studies of the strengths and limitations of early learning, and of brain dynamics in relation to regional maturational stages, promise to yield a better understanding of the sources of human cognitive achievements.

}, doi = {http://dx.doi.org/10.1016/j.neuron.2015.09.026}, author = {Dehaene-Lambertz, G. and Elizabeth S Spelke} } @article {2787, title = {Inferring mass in complex scenes by mental simulation.}, year = {2016} } @conference {2600, title = {Integrating Identification and Perception: A case study of familiar and unfamiliar face processing}, booktitle = {Proceedings of the Thirty-Eight Annual Conference of the Cognitive Science Society}, year = {2016}, month = {2016}, author = {Kelsey Allen and Ilker Yildirim and Joshua B. Tenenbaum} } @article {2142, title = {Introduction Special issue: Deep learning}, journal = {Information and Inference}, volume = {5}, year = {2016}, pages = {103-104}, abstract = {

Faced with large amounts of data, machine learning aims to make predictions. It applies to many types of data, such as images, sounds, biological data, etc. A key difficulty is to find relevant vectorial representations. While this problem had often been handled in an ad hoc way by domain experts, it has recently proved useful to learn these representations directly from large quantities of data, and Deep Learning Convolutional Networks (DLCN) with ReLU nonlinearities have been particularly successful. The representations are then based on compositions of simple parameterized processing units, the depth coming from the large number of such compositions.

The goal of this special issue was to explore some of the mathematical ideas and problems at the heart of deep learning. In particular, two key mathematical questions about deep learning are:

These questions are still open and a full theory of Deep Learning is still in the making. This special issue, however, begins with two papers that provide a useful contribution to several other theoretical questions surrounding supervised deep learning.

}, doi = {10.1093/imaiai/iaw010}, url = {http://imaiai.oxfordjournals.org/content/5/2/103.short}, author = {Bach, Francis and Tomaso Poggio} } @inbook {1722, title = {Intuitive theories}, booktitle = {Oxford Handbook of Causal Reasoning}, year = {2016}, month = {02/2016}, publisher = {Oxford University Press}, organization = {Oxford University Press}, author = {Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {2098, title = {On invariance and selectivity in representation learning}, journal = {Information and Inference: A Journal of the IMA}, year = {2016}, month = {05/2016}, pages = {iaw009}, abstract = {

We study the problem of learning from data representations that are invariant to transformations, and at the same time selective, in the sense that two points have the same representation only if one is the transformation of the other. The mathematical results here sharpen some of the key claims of i-theory{\textemdash}a recent theory of feedforward processing in sensory cortex (Anselmi et al., 2013, Theor. Comput. Sci. and arXiv:1311.4158; Anselmi et al., 2013, Magic materials: a theory of deep hierarchical architectures for learning sensory representations. CBCL Paper; Anselmi \& Poggio, 2010, Representation learning in sensory cortex: a theory. CBMM Memo No. 26).
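For orientation, one standard i-theory construction (written here in our own notation, as a reminder of the kind of representation these results concern) pools a nonlinearity η of dot products between the input and the group orbit of a template:

\[ \mu_{\eta}^{k}(x) \;=\; \frac{1}{|G|} \sum_{g \in G} \eta\big( \langle x,\, g\, t^{k} \rangle \big) \]

Averaging over the group G yields invariance; selectivity comes from using sufficiently many templates t^k and nonlinearities η.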

}, issn = {2049-8764}, doi = {10.1093/imaiai/iaw009}, url = {http://imaiai.oxfordjournals.org/lookup/doi/10.1093/imaiai/iaw009}, author = {F. Anselmi and Lorenzo Rosasco and Tomaso Poggio} } @article {2740, title = {The invisible hand: Toddlers connect probabilistic events with agentive causes}, journal = {Cognitive Science}, volume = {40}, year = {2016}, pages = {23}, chapter = {1854}, abstract = {

Children posit unobserved causes when events appear to occur spontaneously (e.g., Gelman \& Gottfried, 1996).\  What about when events appear to occur probabilistically? Here toddlers (mean: 20.1 months) saw arbitrary causal relationships (Cause A generated Effect A; Cause B generated Effect B) in a fixed, alternating order. The relationships were then changed in one of two ways.\  In the Deterministic condition, the event order changed (Event B preceded Event A); in the Probabilistic condition, the causal relationships changed (Cause A generated Effect B; Cause B generated Effect A). As intended, toddlers looked equally long at both changes (Experiment 1). We then introduced a previously unseen candidate cause.\  Toddlers looked longer at the appearance of a hand (Experiment 2) and novel agent (Experiment 3) in the Deterministic than the Probabilistic conditions, but looked equally long at novel non-agents (Experiment 4), suggesting that by two, toddlers connect probabilistic events with unobserved agents.

}, author = {Wu, Yang and Muentener, Paul and Laura Schulz} } @article {2108, title = {Is it time for a presidential technoethics commission}, year = {2016}, month = {05/2016}, publisher = {The Conversation}, abstract = {

{\textquotedblleft}A recent New York Times article highlighted the growing integration of technologies and textiles, displaying a photograph of a delicate golden nest of optical fiber. The article reported that this new {\textquotedblleft}functional fabric{\textquotedblright} has the added quality that it {\textquotedblleft}acts as an optical bar code to identify who is wearing it.{\textquotedblright}

Is this a feature or a bug? This smart material would certainly be a new milestone in the march of technology and the marketplace to erode personal privacy. Would a suit made of this material need to come with a warning label? Just because we have the technological capability to do something like this, should we?

Similar questions could have been asked about putting GPS technology in our mobile phones, drones in the air and the {\textquotedblleft}cookies{\textquotedblright} resident on our devices to dutifully record and transmit our online activity. Right now, those conversations happen in corporate boardrooms, the media, fictional books and films, and academic settings. But there isn{\textquoteright}t a broad national conversation around the ethics of the steady digital encroachment on our lives. Is it time to create a presidential commission on technoethics? ...{\textquotedblright}

}, url = {https://theconversation.com/is-it-time-for-a-presidential-technoethics-commission-58846}, author = {Dan Rockmore} } @article {1885, title = {Language and Vision Ambiguities (LAVA) Corpus}, year = {2016}, month = {01/2016}, abstract = {

Ambiguity is one of the defining characteristics of human languages, and language understanding crucially relies on the ability to obtain unambiguous representations of linguistic content. While some ambiguities can be resolved using intra-linguistic contextual cues, the disambiguation of many linguistic constructions requires integration of world knowledge and perceptual information obtained from other modalities. In this work, we focus on the problem of grounding language in the visual modality, and introduce a novel task for visual and linguistic understanding which requires resolving linguistic ambiguities by utilizing the visual context of the utterance.

To address this challenge, we release the Language and Vision Ambiguities (LAVA) corpus. LAVA contains ambiguous sentences coupled with visual scenes that depict the different interpretations of each sentence. The sentences in the corpus are annotated with syntactic and semantic parses, and cover a wide range of linguistic ambiguities, including PP and VP attachment, conjunctions, logical forms, anaphora and ellipsis. In addition to the sentence disambiguation challenge, the corpus will support a variety of related tasks which use natural language as a medium for expressing visual understanding.

Reference:
Yevgeni Berzak, Andrei Barbu, Daniel Harari, Boris Katz, and Shimon Ullman (2015). Do You See What I Mean? Visual Resolution of Linguistic Ambiguities. Conference on Empirical Methods in Natural Language Processing (EMNLP), Lisbon, Portugal. [PDF]

Download all of the clips in MP4 format (ZIP)

}, url = {http://web.mit.edu/lavacorpus/}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {1741, title = {Learning Functions: When Is Deep Better Than Shallow}, year = {2016}, abstract = {

While the universal approximation property holds both for hierarchical and shallow networks, we prove that deep (hierarchical) networks can approximate the class of compositional functions with the same accuracy as shallow networks but with an exponentially lower number of training parameters and VC-dimension. This theorem settles an old conjecture by Bengio on the role of depth in networks. We then define a general class of scalable, shift-invariant algorithms to show a simple and natural set of requirements that justify deep convolutional networks.
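As a toy illustration of the function class in question (our example, not one taken from the paper), consider a function with binary-tree compositional structure in which every constituent depends on only two variables:

\[ f(x_1,\dots,x_8) = h_{3}\big( h_{21}\big(h_{11}(x_1,x_2),\, h_{12}(x_3,x_4)\big),\; h_{22}\big(h_{13}(x_5,x_6),\, h_{14}(x_7,x_8)\big) \big) \]

A deep network can mirror this tree with a few units per node, whereas a shallow network must approximate f as a single function of all eight variables at once.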

}, url = {https://arxiv.org/pdf/1603.00988v4.pdf}, author = {Hrushikesh Mhaskar and Qianli Liao and Tomaso Poggio} } @article {1797, title = {Learning mid-level codes for natural sounds}, year = {2016}, month = {02/2016}, address = {Salt Lake City, UT}, abstract = {

Auditory perception depends critically on abstract and behaviorally meaningful representations of natural auditory scenes. These representations are implemented by cascades of neuronal processing stages in which neurons at each stage recode outputs of preceding units. Explanations of auditory coding strategies must thus involve understanding how low-level acoustic patterns are combined into more complex structures. While models exist in the visual domain to explain how phase invariance is achieved by V1 complex cells, and how curvature representations emerge in V2, little is known about analogous grouping principles for mid-level auditory representations.

We propose a hierarchical, generative model of natural sounds that learns combinations of spectrotemporal features from natural stimulus statistics. In the first layer the model forms a sparse, convolutional code of spectrograms. Features learned on speech and environmental sounds resemble spectrotemporal receptive fields (STRFs) of mid-brain and cortical neurons, consistent with previous findings [1]. To generalize from specific STRF activation patterns, the second layer encodes patterns of time-varying magnitude (i.e. variance) of multiple first layer coefficients. Because it forms a code of a non-stationary distribution of STRF activations, it is partially invariant to their specific values. Moreover, because second-layer features are sensitive to STRF combinations, the representation they support is more selective to complex acoustic patterns. The second layer substantially improved the model{\textquoteright}s performance on a denoising task, implying a closer match to the natural stimulus distribution.

Quantitative hypotheses emerge from the model regarding selectivity of auditory neurons characterized by multidimensional STRFs [2] and sensitivity to increasingly more abstract structure [3]. The model also predicts that the auditory system constructs representations progressively more invariant to noise, consistent with recent experimental findings [4]. Our results suggest that mid-level auditory representations may be derived from high-order stimulus dependencies present in the natural environment.\ 

}, url = {http://www.cosyne.org/c/index.php?title=Cosyne2016_posters_2}, author = {Wiktor Mlynarski and Josh H. McDermott} } @article {2664, title = {Learning Mid-Level Codes for Natural Sounds}, year = {2016}, author = {Wiktor Mlynarski and Josh H. McDermott} } @conference {2583, title = {Learning to Answer Questions from Wikipedia Infoboxes}, booktitle = {The 2016 Conference on Empirical Methods on Natural Language Processing (EMNLP 2016)}, year = {2016}, abstract = {

A natural language interface to answers on the Web can help us access information more efficiently. We start with an interesting source of information{\textemdash}infoboxes in Wikipedia that summarize factoid knowledge{\textemdash}and develop a comprehensive approach to answering questions with high precision. We first build a system to access data in infoboxes in a structured manner. We use our system to construct a crowdsourced dataset of over 15,000 high-quality, diverse questions. With these questions, we train a convolutional neural network model that outperforms models that achieve top results in similar answer selection tasks.

}, author = {Alvaro Morales and Varot Premtoon and Cordelia Avery and Sue Felshin and Boris Katz} } @proceedings {2618, title = {Lecture Notes in Computer ScienceComputer Vision {\textendash} ECCV 2016Ambient Sound Provides Supervision for Visual Learning}, year = {2016}, month = {10/2016}, pages = {801 - 816}, address = {Cham}, abstract = {

The sound of crashing waves, the roar of fast-moving cars {\textendash} sound conveys important information about the objects in our surroundings. In this work, we show that ambient sounds can be used as a supervisory signal for learning visual models. To demonstrate this, we train a convolutional neural network to predict a statistical summary of the sound associated with a video frame. We show that, through this process, the network learns a representation that conveys information about objects and scenes. We evaluate this representation on several recognition tasks, finding that its performance is comparable to that of other state-of-the-art unsupervised learning methods. Finally, we show through visualizations that the network learns units that are selective to objects that are often associated with characteristic sounds.

}, keywords = {convolutional networks, Sound, unsupervised learning}, isbn = {978-3-319-46447-3}, issn = {0302-9743}, doi = {10.1007/978-3-319-46448-010.1007/978-3-319-46448-0_48}, url = {http://link.springer.com/10.1007/978-3-319-46448-0}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and William T. Freeman and Torralba, Antonio} } @article {2319, title = {A look back at the June 2016 BMM Workshop in Sestri Levante, Italy}, year = {2016}, month = {11/2016}, abstract = {

"On June 20th 2016, the first of a series of workshops on the science of intelligence kicked off in Sestri Levante, Italy. Organized by the Center for Brains, Minds, and Machines (CBMM), the Italian Institute of Technology (IIT), and the Max Plank Institution for Biological Cybernetics, this three-day workshop brought together an international cast of researchers to discuss human and machine intelligence. Computer scientists, cognitive scientists, and neuroscientists collaborated in a wide-ranging conversation about integrating different approaches to intelligence, both artificial and human, into a coherent science of intelligence..."

View the BMM Workshop in Sestri Levante page and watch the videos.

}, author = {Boris Katz and Andrei Barbu} } @article {2665, title = {Lossy Compression of Sound Texture by the Human Auditory System}, year = {2016}, author = {Wiktor Mlynarski and Josh H. McDermott} } @proceedings {2121, title = {A machine learning approach to predict episodic memory formation}, year = {2016}, pages = {539 - 544 }, address = {Princeton, NJ }, doi = {10.1109/CISS.2016.7460560}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=7460560\&newsearch=true\&queryText=A\%20machine\%20learning\%20approach\%20to\%20predict\%20episodic\%20memory\%20formation}, author = {Hanlin Tang and Jedediah Singer and Matias J. Ison and Gnel Pivazyan and Melissa Romaine and Elizabeth Meller and Victoria Perron and Marlise Arlellano and Gabriel Kreiman and Melissa Romaine and Adrianna Boulin and Rosa Frias and James Carroll and Sarah Dowcett} } @conference {2640, title = {Making learning count: A large-scale randomized control trial testing the effects of core mathematical training on school readiness in young children.}, booktitle = {International Mind, Brain and Education Society}, year = {2016} } @article {2067, title = {Marvin L. Minsky (1927{\textendash}2016) Scientist and inventor was a visionary founder of AI}, year = {2016}, month = {03/2016}, abstract = {

In 1950, Alan Turing argued that there was no convincing reason to doubt that computers could be intelligent. Then, in the late 1950s, Marvin Minsky, John McCarthy, Herbert Simon, and Allen Newell started the work that led everyone to think of them as the founders of the field of Artificial Intelligence. Marvin Minsky, the last of the founders, died on January 24, 2016.
Minsky championed the idea that computers would someday think like we think and surpass us. He had no patience with those who doubted that computers could be intelligent at a human level or beyond. In the early days, there were many doubters, and were it not for Minsky{\textquoteright}s determined advocacy, the field of AI might have foundered...

}, author = {Patrick Henry Winston} } @article {2595, title = {Mastery of the logic of natural numbers is not the result of mastery of counting: Evidence from late counters. }, journal = {Developmental Science}, year = {2016}, doi = {10.1111/desc.12459}, author = {Julian Jara-Ettinger and Steve Piantadosi and Elizabeth S Spelke and Roger Levy and Edward Gibson} } @article {2326, title = {Measuring and modeling the perception of natural and unconstrained gaze in humans and machines}, year = {2016}, month = {11/2016}, abstract = {

Humans are remarkably adept at interpreting the gaze direction of other individuals in their surroundings. This skill is at the core of the ability to engage in joint visual attention, which is essential for establishing social interactions. How accurate are humans in determining the gaze direction of others in lifelike scenes, when they can move their heads and eyes freely, and what are the sources of information for the underlying perceptual processes? These questions pose a challenge from both empirical and computational perspectives, due to the complexity of the visual input in real-life situations. Here we measure empirically human accuracy in perceiving the gaze direction of others in lifelike scenes, and study computationally the sources of information and representations underlying this cognitive capacity. We show that humans perform better in face-to-face conditions compared with recorded conditions, and that this advantage is not due to the availability of input dynamics. We further show that humans still perform well when only the eyes region is visible, rather than the whole face. We develop a computational model, which replicates the pattern of human performance, including the finding that the eyes region contains, on its own, the required information for estimating both head orientation and direction of gaze. Consistent with neurophysiological findings on task-specific face regions in the brain, the learned computational representations reproduce perceptual effects such as the Wollaston illusion, when trained to estimate direction of gaze, but not when trained to recognize objects or faces.

}, keywords = {computational evaluation, computational modeling, Computer vision, empirical evaluation, estimation of gaze direction, Gaze perception, joint attention, Machine Learning}, author = {Daniel Harari and Tao Gao and Nancy Kanwisher and Joshua B. Tenenbaum and Shimon Ullman} } @article {2765, title = {Mechanisms of color perception and cognition covered by$\#$ thedress}, volume = {16}, year = {2016}, month = {8/2016}, pages = {746-746}, address = {Journal of Vision}, abstract = {

Color is notoriously ambiguous{\textemdash}many color illusions exist{\textemdash}but until now it has been thought that all people with normal color vision experience color illusions the same way. How does the visual system resolve color ambiguity? Here, we present work that addresses this question by quantifying people{\textquoteright}s perception of a particularly ambiguous image, the dress photograph. The colors of the individual pixels in the photograph when viewed in isolation are light-blue or brown, but popular accounts suggest the dress appears either white/gold or blue/black. We tested more than 1400 people, both on-line and under controlled laboratory conditions. Subjects first completed the sentence: this is a ___ and ___ dress. Then they performed a color-matching experiment that did not depend on language. Surprisingly, the results uncovered three groups of subjects: white/gold observers, blue/black observers and blue/brown observers. Our findings show that the brain resolves ambiguity in the dress into one of three stable states; a minority of people switched which colors they saw (~11\%). It is clear that what we see depends on both retinal stimulation and internal knowledge about the world. Cases of multi-stability such as the dress provide a rare opportunity to investigate this interplay. In particular, we go on to demonstrate that the dress photograph can be used as a tool to discover that skin reflectance is a particularly important implicit cue used by the brain to estimate the color of the light source, to resolve color ambiguity, shedding light on the role of high-level cues in color perception.

}, doi = {10.1167/16.12.746}, author = {B. R. Conway and Rosa Lafer-Sousa and Katherine Hermann} } @article {2789, title = {Mechanisms of color perception and cognition covered by $\#$thedress}, volume = {16}, year = {2016}, month = {9/2016}, pages = {746}, type = {Conference Talk}, abstract = {

Color is notoriously ambiguous{\textemdash}many color illusions exist{\textemdash}but until now it has been thought that all people with normal color vision experience color illusions the same way. How does the visual system resolve color ambiguity? Here, we present work that addresses this question by quantifying people{\textquoteright}s perception of a particularly ambiguous image, the dress photograph. The colors of the individual pixels in the photograph when viewed in isolation are light-blue or brown, but popular accounts suggest the dress appears either white/gold or blue/black. We tested more than 1400 people, both on-line and under controlled laboratory conditions. Subjects first completed the sentence: this is a ___ and ___ dress. Then they performed a color-matching experiment that did not depend on language. Surprisingly, the results uncovered three groups of subjects: white/gold observers, blue/black observers and blue/brown observers. Our findings show that the brain resolves ambiguity in the dress into one of three stable states; a minority of people switched which colors they saw (~11\%). It is clear that what we see depends on both retinal stimulation and internal knowledge about the world. Cases of multi-stability such as the dress provide a rare opportunity to investigate this interplay. In particular, we go on to demonstrate that the dress photograph can be used as a tool to discover that skin reflectance is a particularly important implicit cue used by the brain to estimate the color of the light source, to resolve color ambiguity, shedding light on the role of high-level cues in color perception.

}, author = {B. R. Conway and R. Lafer-Sousa and Katherine Hermann} } @proceedings {1865, title = {Modeling Human Ad Hoc Coordination}, year = {2016}, month = {02/2016}, author = {Peter Krafft and Chris Baker and Alex "Sandy" Pentland and Joshua B. Tenenbaum} } @conference {1864, title = {Modeling human understanding of complex intentional action with a Bayesian nonparametric subgoal model}, booktitle = {AAAI}, year = {2016}, month = {02/2016}, author = {Ryo Nakahashi and Chris Baker and Joshua B. Tenenbaum} } @article {2786, title = {The naive utility calculus: Computational principles underlying commonsense psychology}, year = {2016} } @article {2492, title = {The naive utility calculus: computational principles underlying social cognition}, journal = {Trends Cogn Sci.}, year = {2016}, doi = {10.1016/j.tics.2016.05.011}, url = {https://www.ncbi.nlm.nih.gov/pubmed/27388875}, author = {Julian Jara-Ettinger and Hyowon Gweon and Laura Schulz and Joshua B. Tenenbaum} } @proceedings {1723, title = {Natural science: Active learning in dynamic physical microworlds}, year = {2016}, publisher = {38th Annual Meeting of the Cognitive Science Society}, author = {Neil Bramley and Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {3281, title = {Nested Invariance Pooling and RBM Hashing for Image Instance Retrieval}, journal = {arXiv.org}, year = {2016}, month = {03/2016}, abstract = {

The goal of this work is the computation of very compact binary hashes for image instance retrieval. Our approach has two novel contributions. The first one is Nested Invariance Pooling (NIP), a method inspired by i-theory, a mathematical theory for computing group invariant transformations with feed-forward neural networks. NIP is able to produce compact and well-performing descriptors with visual representations extracted from convolutional neural networks. We specifically incorporate scale, translation and rotation invariances, but the scheme can be extended to any arbitrary sets of transformations. We also show that using moments of increasing order throughout nesting is important. The NIP descriptors are then hashed to the target code size (32-256 bits) with a Restricted Boltzmann Machine with a novel batch-level regularization scheme specifically designed for the purpose of hashing (RBMH). A thorough empirical evaluation against the state of the art shows that the results obtained both with the NIP descriptors and the NIP+RBMH hashes are consistently outstanding across a wide range of datasets.
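A rough sketch of the nesting idea as we read it from the abstract (the `descriptor` and `transform` functions are placeholders, not the authors' pipeline): pool descriptors over one transformation group with a low-order statistic, then pool the result over the next group with a higher-order moment, and so on.

```python
# Hedged sketch of nested invariance pooling with moments of increasing order.
import numpy as np

def transform(image, translation, scale, rotation):
    """Placeholder for an image warp; a real implementation would shift,
    rescale and rotate the image accordingly."""
    return image

def nip_descriptor(descriptor, image, translations, scales, rotations):
    per_rotation = []
    for rot in rotations:
        per_scale = []
        for s in scales:
            # first group (translations): 1st moment (mean) of the descriptors
            d = np.mean([descriptor(transform(image, t, s, rot))
                         for t in translations], axis=0)
            per_scale.append(d)
        # second group (scales): 2nd moment (standard deviation)
        per_rotation.append(np.std(per_scale, axis=0))
    # third group (rotations): a final max-pooling step
    return np.max(per_rotation, axis=0)
```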

}, keywords = {CNN, Hashing, Image Instance Retrieval, Invariant Representation, Regularization, unsupervised learning}, url = {https://arxiv.org/abs/1603.04595}, author = {Olivier Mor{\`e}re and Antoine Veillard and Vijay Chandrasekhar and Tomaso Poggio} } @article {2064, title = {Neural Information Processing Systems (NIPS) 2015 Review}, year = {2016}, month = {01/2016}, abstract = {

The charming city of Montreal hosted more than 4000 researchers from all over the globe during the Neural Information Processing Systems (NIPS) conference. In addition to the notable exponential growth in the number of attendees, a novel highlight this year was the addition of a Symposium format. The Brains, Minds and Machines Symposium aimed to discuss the relationship between biological hardware and how to understand the fundamental computations that give rise to intelligence...

View the CBMM NIPS page and watch the videos.

}, author = {Gabriel Kreiman} } @article {2198, title = {Neural Representations Integrate the Current Field of View with the Remembered 360{\textdegree} Panorama}, journal = {Current Biology}, year = {2016}, month = {09/08/2016}, abstract = {

We experience our visual environment as a seamless, immersive panorama. Yet, each view is discrete and fleeting, separated by expansive eye movements and discontinuous views of our spatial surroundings. How are discrete views of a panoramic environment knit together into a broad, unified memory representation? Regions of the brain{\textquoteright}s {\textquotedblleft}scene network{\textquotedblright} are well poised to integrate retinal input and memory [1]: they are visually driven [2, 3] but also densely interconnected with memory structures in the medial temporal lobe [4]. Further, these regions harbor memory signals relevant for navigation [5{\textendash}8] and adapt across overlapping shifts in scene viewpoint [9, 10]. However, it is unknown whether regions of the scene network support visual memory for the panoramic environment outside of the current field of view and, further, how memory for the surrounding environment influences ongoing perception. Here, we demonstrate that specific regions of the scene network{\textemdash}the retrosplenial complex (RSC) and occipital place area (OPA){\textemdash}unite discrete views of a 360{\textdegree} panoramic environment, both current and out of sight, in a common representational space. Further, individual scene views prime associated representations of the panoramic environment in behavior, facilitating subsequent perceptual judgments. We propose that this dynamic interplay between memory and perception plays an important role in weaving the fabric of continuous visual experience.

}, doi = {10.1016/j.cub.2016.07.002}, url = {http://www.cell.com/current-biology/abstract/S0960-9822(16)30753-9}, author = {Robertson, Caroline~E. and Katherine Hermann and Mynick, Anna and Kravitz, Dwight~J. and Nancy Kanwisher} } @article {1892, title = {Neural Tuning Size in a Model of Primate Visual Processing Accounts for Three Key Markers of Holistic Face Processing}, journal = {Public Library of Science | PLoS ONE }, volume = {1(3): e0150980}, year = {2016}, month = {03/2016}, abstract = {

Faces are an important and unique class of visual stimuli, and have been of interest to neuroscientists for many years. Faces are known to elicit certain characteristic behavioral markers, collectively labeled {\textquotedblleft}holistic processing{\textquotedblright}, while non-face objects are not processed holistically. However, little is known about the underlying neural mechanisms. The main aim of this computational simulation work is to investigate the neural mechanisms that make face processing holistic. Using a model of primate visual processing, we show that a single key factor, {\textquotedblleft}neural tuning size{\textquotedblright}, is able to account for three important markers of holistic face processing: the Composite Face Effect (CFE), Face Inversion Effect (FIE) and Whole-Part Effect (WPE). Our proof-of-principle specifies the precise neurophysiological property that corresponds to the poorly-understood notion of holism, and shows that this one neural property controls three classic behavioral markers of holism. Our work is consistent with neurophysiological evidence, and makes further testable predictions. Overall, we provide a parsimonious account of holistic face processing, connecting computation, behavior and neurophysiology.

}, doi = {10.1371/journal.pone.0150980}, url = {http://journals.plos.org/plosone/article?id=10.1371\%2Fjournal.pone.0150980}, author = {Cheston Tan and Tomaso Poggio} } @article {2542, title = {New Data Science tools for analyzing neural data and computational models}, year = {2016}, abstract = {

As the amount of data collected by neuroscientists continues to increase (Stevenson et al, 2011), new tools are needed to turn these data into insights about the algorithms that underlie complex behavior (Brown et al, 2004). Here we present our latest research on computational tools we have developed at Hampshire College and at the Center for Brains, Minds and Machines at MIT. In particular, we describe new tools for neural population decoding including a graphical user interface to the Neural Decoding Toolbox (Meyers 2013), methods for analyzing single neurons, and ongoing work on a parallelized population decoding framework that uses R and Apache Spark{\texttrademark} to greatly increase the speed of population decoding. We also discuss CBaaS, which is a distributed platform that allows one to evaluate the effectiveness of different computational models (such as different versions of deep neural networks). These tools will allow researchers to gain deeper insights from the data they collect, and to better assess whether computational models are acting in similar ways to biological systems.

}, author = {Ethan Meyers and Mike Dean and Gregory J Hale} } @inbook {2286, title = {Object and Scene Perception}, booktitle = {From Neuron to Cognition via Computational Neuroscience}, year = {2016}, publisher = {The MIT Press}, organization = {The MIT Press}, chapter = {17}, address = {Cambridge, MA, USA}, abstract = {

Overview

This textbook presents a wide range of subjects in neuroscience from a computational perspective. It offers a comprehensive, integrated introduction to core topics, using computational tools to trace a path from neurons and circuits to behavior and cognition. Moreover, the chapters show how computational neuroscience{\textemdash}methods for modeling the causal interactions underlying neural systems{\textemdash}complements empirical research in advancing the understanding of brain and behavior.

The chapters{\textemdash}all by leaders in the field, and carefully integrated by the editors{\textemdash}cover such subjects as action and motor control; neuroplasticity, neuromodulation, and reinforcement learning; vision; and language{\textemdash}the core of human cognition.

The book can be used for advanced undergraduate or graduate level courses. It presents all necessary background in neuroscience beyond basic facts about neurons and synapses and general ideas about the structure and function of the human brain. Students should be familiar with differential equations and probability theory, and be able to pick up the basics of programming in MATLAB and/or Python. Slides, exercises, and other ancillary materials are freely available online, and many of the models described in the chapters are documented in the brain operation database, BODB (which is also described in a book chapter).

Available now through MIT Press - https://mitpress.mit.edu/neuron-cognition

}, isbn = {9780262034968}, url = {https://mitpress.mit.edu/neuron-cognition}, author = {Owen Lewis and Tomaso Poggio} } @article {2571, title = {The occipital place area represents the local elements of scenes}, journal = {NeuroImage}, volume = {132}, year = {2016}, month = {02/2016}, pages = {417 - 424}, abstract = {

Neuroimaging studies have identified three scene-selective regions in human cortex: parahippocampal place area (PPA), retrosplenial complex (RSC), and occipital place area (OPA). However, precisely what scene information each region represents is not clear, especially for the least studied, more posterior OPA. Here we hypothesized that OPA represents local elements of scenes within two independent, yet complementary scene descriptors: spatial boundary (i.e., the layout of external surfaces) and scene content (e.g., internal objects). If OPA processes the local elements of spatial boundary information, then it should respond to these local elements (e.g., walls) themselves, regardless of their spatial arrangement. Indeed, we found that OPA, but not PPA or RSC, responded similarly to images of intact rooms and these same rooms in which the surfaces were fractured and rearranged, disrupting the spatial boundary. Next, if OPA represents the local elements of scene content information, then it should respond more when more such local elements (e.g., furniture) are present. Indeed, we found that OPA, but not PPA or RSC, responded more to multiple than single pieces of furniture. Taken together, these findings reveal that OPA analyzes local scene elements - both in spatial boundary and scene content representation - while PPA and RSC represent global scene properties.

}, issn = {10538119}, doi = {10.1016/j.neuroimage.2016.02.062}, url = {https://www.ncbi.nlm.nih.gov/pubmed/26931815}, author = {Kamps, Frederik S. and Julian, Joshua B. and Jonas Kubilius and Nancy Kanwisher and Dilks, Daniel D.} } @article {1823, title = {Pragmatic Reasoning through Semantic Inference}, journal = {Semantics and Pragmatics}, volume = { Vol 9 (2016) }, year = {2016}, abstract = {

A number of recent proposals have used techniques from game theory and Bayesian cognitive science to formalize Gricean pragmatic reasoning (Franke 2009, Frank \& Goodman 2012, Goodman \& Stuhlmüller 2013, J{\"a}ger 2012). We discuss two phenomena which pose a challenge to these accounts of pragmatics: M-implicatures (Horn 1984) and embedded implicatures which violate Hurford{\textquoteright}s constraint (Hurford 1974, Chierchia et al. 2012). While techniques have been developed for deriving M-implicatures, Hurford-violating embedded implicatures pose a more fundamental challenge, because of basic limitations in the models{\textquoteright} architecture. In order to explain these phenomena, we propose a realignment of the division between semantic content and pragmatic content. Under this proposal, the semantic content of an utterance is not fixed independent of pragmatic inference; rather, pragmatic inference partially determines an utterance{\textquoteright}s semantic content. We show how semantic inference can be realized as an extension to the Rational Speech Acts framework (Goodman \& Stuhlmüller 2013). The addition of lexical uncertainty derives both M-implicatures and the relevant embedded implicatures, and preserves the derivations of more standard implicatures. We use this principle to explain a novel class of implicature, non-convex disjunctive implicatures, which have several theoretically interesting properties. In particular, these implicatures can be preserved in downward-entailing contexts in the absence of accenting, a property which is predicted by lexical uncertainty, but which violates prior generalizations in the literature (Horn 1989, Fox \& Spector Forthcoming).
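For readers unfamiliar with the Rational Speech Acts framework that the proposal extends, a toy version of the basic recursion (literal listener, speaker, pragmatic listener) is sketched below; the two-utterance lexicon and rationality parameter are hypothetical, and the lexical-uncertainty extension itself is not implemented here:

# Toy Rational Speech Acts recursion: literal listener -> speaker -> pragmatic listener.
import numpy as np

utterances = ["some", "all"]
worlds = ["some-but-not-all", "all"]
# Hypothetical truth-conditional lexicon: lexicon[u][w] = 1 if utterance u is true of world w.
lexicon = np.array([[1.0, 1.0],    # "some" is true of both worlds
                    [0.0, 1.0]])   # "all" is true only of the "all" world
alpha = 4.0                        # speaker rationality

L0 = lexicon / lexicon.sum(axis=1, keepdims=True)   # literal listener P_L0(w | u)
S1 = (L0 + 1e-12) ** alpha                           # speaker: proportional to L0^alpha
S1 = S1 / S1.sum(axis=0, keepdims=True)              # P_S1(u | w), normalized over utterances
L1 = S1 / S1.sum(axis=1, keepdims=True)              # pragmatic listener P_L1(w | u), uniform prior

print(dict(zip(worlds, np.round(L1[utterances.index("some")], 3))))
# The pragmatic listener strengthens "some" toward "some but not all".

Lexical uncertainty would add a prior over possible lexica and marginalize over them at the level of the pragmatic listener.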


This is an early access version of Bergen, Leon, Roger Levy, and Noah Goodman. 2016. Pragmatic reasoning through semantic inference. Semantics and Pragmatics 9(20). This version will be replaced with the final typeset version in due course. NB: page numbers will change, so cite with caution.

This is an open-access article distributed under the terms of a Creative Commons Attribution License (http://creativecommons.org/licenses/by/3.0/).

}, issn = {ISSN: 1937-8912}, doi = {http://dx.doi.org/10.3765/sp.9.20}, url = {http://semprag.org/article/view/sp.9.20}, author = {Leon Bergen and Roger Levy and Noah D. Goodman} } @article {2565, title = {Predicting episodic memory formation for movie events}, journal = {Scientific Reports}, year = {2016}, month = {10/2016}, abstract = {

Episodic memories are long lasting and full of detail, yet imperfect and malleable. We quantitatively evaluated recollection of short audiovisual segments from movies as a proxy to real-life memory formation in 161 subjects at 15 minutes up to a year after encoding. Memories were reproducible within and across individuals, showed the typical decay with time elapsed between encoding and testing, were fallible yet accurate, and were insensitive to low-level stimulus manipulations but sensitive to high-level stimulus properties. Remarkably, memorability was also high for single movie frames, even one year post-encoding. To evaluate what determines the efficacy of long-term memory formation, we developed an extensive set of content annotations that included actions, emotional valence, visual cues and auditory cues. These annotations enabled us to document the content properties that showed a stronger correlation with recognition memory and to build a machine-learning computational model that accounted for episodic memory formation in single events for group averages and individual subjects with an accuracy of up to 80\%. These results provide initial steps towards the development of a quantitative computational theory capable of explaining the subjective filtering steps that lead to how humans learn and consolidate memories.
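A schematic version of the final modeling step (hypothetical feature names and synthetic data; the paper's annotations and classifier details differ): content annotations for each event serve as features in a cross-validated model predicting later recognition.

# Sketch: predict later recognition of movie events from content annotations.
# Synthetic data and hypothetical feature names, for illustration only.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(1)
n_events = 500
features = ["contains_action", "emotional_valence_high", "face_present", "speech_present"]
X = rng.integers(0, 2, size=(n_events, len(features)))   # binary annotations per event
true_w = np.array([1.5, 1.0, 0.5, 0.2])                  # assumed contribution of each property
p_remember = 1 / (1 + np.exp(-(X @ true_w - 1.5)))
y = rng.binomial(1, p_remember)                          # 1 = event later recognized

auc = cross_val_score(LogisticRegression(), X, y, cv=5, scoring="roc_auc").mean()
print(f"cross-validated AUC: {auc:.2f}")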

}, doi = {10.1038/srep30175}, url = {http://www.nature.com/articles/srep30175}, author = {Hanlin Tang and Jedediah Singer and Matias J. Ison and Gnel Pivazyan and Melissa Romaine and Rosa Frias and Elizabeth Meller and Adrianna Boulin and James Carroll and Victoria Perron and Sarah Dowcett and Arellano, Marlise and Gabriel Kreiman} } @article {2885, title = {Predicting episodic memory formation for movie events [code]}, year = {2016}, abstract = {

Episodic memories are long lasting and full of detail, yet imperfect and malleable. We quantitatively evaluated recollection of short audiovisual segments from movies as a proxy to real-life memory formation in 161 subjects at 15 minutes up to a year after encoding. Memories were reproducible within and across individuals, showed the typical decay with time elapsed between encoding and testing, were fallible yet accurate, and were insensitive to low-level stimulus manipulations but sensitive to high-level stimulus properties. Remarkably, memorability was also high for single movie frames, even one year post-encoding. To evaluate what determines the efficacy of long-term memory formation, we developed an extensive set of content annotations that included actions, emotional valence, visual cues and auditory cues. These annotations enabled us to document the content properties that showed a stronger correlation with recognition memory and to build a machine-learning computational model that accounted for episodic memory formation in single events for group averages and individual subjects with an accuracy of up to 80\%. These results provide initial steps towards the development of a quantitative computational theory capable of explaining the subjective filtering steps that lead to how humans learn and consolidate memories.


To view more information and download datasets, etc., please visit the project website - http://klab.tch.harvard.edu/resources/Tangetal_episodicmemory_2016.html


The corresponding publication can be found here.


The corresponding dataset entry can be found here.

}, author = {Hanlin Tang and Jedediah Singer and Matias Ison and Gnel Pivazyan and Melissa Romaine and Rosa Frias and Elizabeth Meller and Adrianna Boulin and James Carroll and Victoria Perron and Sarah Dowcett and Marlise Arlellano and Gabriel Kreiman} } @article {2886, title = {Predicting episodic memory formation for movie events [dataset]}, year = {2016}, abstract = {

Episodic memories are long lasting and full of detail, yet imperfect and malleable. We quantitatively evaluated recollection of short audiovisual segments from movies as a proxy to real-life memory formation in 161 subjects at 15 minutes up to a year after encoding. Memories were reproducible within and across individuals, showed the typical decay with time elapsed between encoding and testing, were fallible yet accurate, and were insensitive to low-level stimulus manipulations but sensitive to high-level stimulus properties. Remarkably, memorability was also high for single movie frames, even one year post-encoding. To evaluate what determines the efficacy of long-term memory formation, we developed an extensive set of content annotations that included actions, emotional valence, visual cues and auditory cues. These annotations enabled us to document the content properties that showed a stronger correlation with recognition memory and to build a machine-learning computational model that accounted for episodic memory formation in single events for group averages and individual subjects with an accuracy of up to 80\%. These results provide initial steps towards the development of a quantitative computational theory capable of explaining the subjective filtering steps that lead to how humans learn and consolidate memories.


To view more information and download datasets, etc., please visit the project website - http://klab.tch.harvard.edu/resources/Tangetal_episodicmemory_2016.html


The corresponding publication can be found here.


The corresponding code entry can be found here.

}, author = {Hanlin Tang and Jedediah Singer and Matias Ison and Gnel Pivazyan and Melissa Romaine and Rosa Frias and Elizabeth Meller and Adrianna Boulin and James Carroll and Victoria Perron and Sarah Dowcett and Marlise Arlellano and Gabriel Kreiman} } @article {2882, title = {PredNet - "Deep Predictive Coding Networks for Video Prediction and Unsupervised Learning" [code]}, year = {2016}, abstract = {

The PredNet is a deep convolutional recurrent neural network inspired by the principles of predictive coding from the neuroscience literature [1, 2]. It is trained for next-frame video prediction with the belief that prediction is an effective objective for unsupervised (or "self-supervised") learning [e.g. 3-11].


For full project information and links to download code, etc. visit the website - https://coxlab.github.io/prednet/

}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {1810, title = {Pre-reaching infants expect causal agents to act efficiently without motor training}, year = {2016}, month = {05/2016}, author = {Shari Liu and Neon B. Brooks and Elizabeth S Spelke} } @article {1959, title = {Preverbal Infants{\textquoteright} Third-Party Imitator Preferences: Animated Displays versus Filmed Actors}, year = {2016}, month = {05/2016}, address = {New Orleans, Louisiana}, abstract = {

Research on social imitation shows that, from toddlerhood to adulthood, individuals respond more positively to social partners who imitate them compared to those who do not (Chartrand \& Bargh, 1999; Meltzoff, 1990; Agnetta \& Rochat, 2004). It is unknown, however, whether (i) positive responses to imitation are present in the first year of life, before infants engage in robust social imitation themselves and (ii) whether positive evaluations of imitators are restricted to direct, 1st person interactions. A recent study found that infants 13 months old and younger who observed imitative and non-imitative interactions between animated, geometric figures looked at and reached for imitators more than non-imitators, but found no difference in looking to targets of imitation versus non-targets (Powell \& Spelke, in prep). These data suggest infants may recognize and prefer imitators on the basis of 3rd party observation. However, it is unknown whether this pattern would generalize from simplified, animated displays to more ecologically valid stimuli (e.g. videos of complex human movement).

In our current study, we tested 4- to 5.5-month-old infants (N = 112) using animations and videos of human actors. For both stimulus types, infants saw either two individuals take turns responding to a third, one imitating and one not (responders condition), or one individual respond to two others, imitating one but not the other (initiators condition; Figure 1). In the animated stimuli, each character jumped and made either a high- or low-pitched sound. In the video stimuli each actor made one of two hand motions modified from American Sign Language. The sound or motion matched in imitative interactions but not in non-imitative ones. Depending on condition, the experiment concluded with a differential looking test between either the imitating and non-imitating responder or the imitated and non-imitated initiator. A repeated measures ANOVA revealed a significant main effect of condition (F(1,97) = 6.58; P \< .05) but not of stimulus type (P \> 0.8) on infants{\textquoteright} looking during the preference test. Regardless of stimulus type, infants in the responders condition looked longer to imitators (M = 6.79 s) than non-imitators (M = 4.38 s; t(47) = 4.05 P \< 0.001), while infants in the targets condition did not differentiate between targets (M = 5.89 s) and non-targets of imitation (M = 6.09 s; t(48) = 0.80; P \> 0.2). We are replicating the responders condition using videos of new actors performing simpler actions. Preliminary results indicate infants (N = 14 of an intended 32) continue to look longer at imitators than non-imitators. The congruent results obtained with both animated and video stimuli confirm the validity of the use of animated stimuli for studying infant social cognition. Additionally, these results demonstrate that young infants recognize imitation as 3rd party observers and are biased to attend more to those who have imitated others. Consistent with social imitation research, our results suggest infants may have an early-emerging preference for imitators. This potential preference may lay the foundation for the capacity to engage in socially guided learning.\ 

}, author = {Heather L Kosakowski and Lindsey J Powell and Elizabeth S Spelke} } @article {2109, title = {Probing the compositionality of intuitive functions}, year = {2016}, month = {05/2016}, abstract = {

How do people learn about complex functional structure? Taking inspiration from other areas of cognitive science, we propose that this is accomplished by harnessing compositionality: complex structure is decomposed into simpler building blocks. We formalize this idea within the framework of Bayesian regression using a grammar over Gaussian process kernels. We show that participants prefer compositional over non-compositional function extrapolations, that samples from the human prior over functions are best described by a compositional model, and that people perceive compositional functions as more predictable than their non-compositional but otherwise similar counterparts. We argue that the compositional nature of intuitive functions is consistent with broad principles of human cognition.
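As a concrete illustration of kernel composition (using scikit-learn's Gaussian-process kernels; the particular composition below is an arbitrary example, not the paper's grammar):

# Compositional kernels: complex functional structure built from simple parts,
# here a periodic component modulated by a smooth kernel plus a linear trend.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ExpSineSquared, DotProduct, WhiteKernel

x = np.linspace(0, 10, 60)[:, None]
y = np.sin(2 * x).ravel() + 0.3 * x.ravel() + 0.1 * np.random.default_rng(0).normal(size=60)

# Compositional hypothesis: periodic * smooth + linear (+ noise).
kernel = ExpSineSquared(length_scale=1.0, periodicity=3.0) * RBF(length_scale=5.0) + DotProduct() + WhiteKernel(noise_level=0.1)
gp = GaussianProcessRegressor(kernel=kernel).fit(x, y)
print(gp.kernel_)                       # fitted compositional structure
print(gp.predict(np.array([[11.0]])))   # extrapolation beyond the training range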

}, author = {Eric Schulz and Joshua B. Tenenbaum and David Duvenaud and Maarten Speekenbrink and Samuel J Gershman} } @article {2531, title = {Rapid Physical Predictions from Convolutional Neural Networks}, year = {2016}, url = {http://phys.csail.mit.edu/papers/9.pdf}, author = {Filipe Peres and Kevin A Smith and Joshua B. Tenenbaum} } @article {2723, title = {Recognizing and Interpreting Social Interactions in Local Image Regions}, year = {2016}, note = {

(Accepted for oral presentation)

}, month = {11/2016}, abstract = {

Understanding social interactions (such as {\textquoteright}hug{\textquoteright} or {\textquoteright}fight{\textquoteright}) is a basic and important capacity of the human visual system, but a challenging and still open problem for modeling. Here we study visual recognition of social interactions, based on small but recognizable local regions. The approach is based on two novel key components: (i) A given social interaction can be recognized reliably from reduced images (called {\textquoteright}minimal images{\textquoteright}). (ii) The recognition of a social interaction depends on identifying components and relations within the minimal image (termed {\textquoteright}interpretation{\textquoteright}). We show psychophysics data for minimal images and modeling results for their interpretation.

}, author = {Guy Ben-Yosef and Alon Yachin and Shimon Ullman} } @article {1812, title = {A Review of Relational Machine Learning for Knowledge Graphs}, journal = {Proceedings of the IEEE}, volume = {104}, year = {2016}, month = {Jan-01-2016}, pages = {11 - 33}, abstract = {

Relational machine learning studies methods for the statistical analysis of relational, or graph-structured, data. In this paper, we provide a review of how such statistical models can be {\textquotedblleft}trained{\textquotedblright} on large knowledge graphs, and then used to predict new facts about the world (which is equivalent to predicting new edges in the graph). In particular, we discuss two fundamentally different kinds of statistical relational models, both of which can scale to massive data sets. The first is based on latent feature models such as tensor factorization and multiway neural networks. The second is based on mining observable patterns in the graph. We also show how to combine these latent and observable models to get improved modeling power at decreased computational cost. Finally, we discuss how such statistical models of graphs can be combined with text-based information extraction methods for automatically constructing knowledge graphs from the Web. To this end, we also discuss Google{\textquoteright}s knowledge vault project as an example of such combination.
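A minimal sketch of the latent-feature family of models discussed above, using a DistMult-style bilinear score purely as an illustration (random embeddings stand in for learned ones; the entities and relation are hypothetical):

# Latent-feature scoring of knowledge-graph triples: score(s, r, o) = <e_s, w_r, e_o>.
import numpy as np

entities = {"Paris": 0, "France": 1, "Berlin": 2}
relations = {"capital_of": 0}
dim = 16
rng = np.random.default_rng(0)
E = rng.normal(size=(len(entities), dim))     # entity embeddings (would be learned)
R = rng.normal(size=(len(relations), dim))    # relation embeddings (would be learned)

def score(s, r, o):
    # DistMult-style trilinear score; higher = more plausible edge in the graph.
    return float(np.sum(E[entities[s]] * R[relations[r]] * E[entities[o]]))

print(score("Paris", "capital_of", "France"))
print(score("Berlin", "capital_of", "France"))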

}, issn = {0018-9219}, doi = {10.1109/JPROC.2015.2483592}, url = {http://ieeexplore.ieee.org/lpdocs/epic03/wrapper.htm?arnumber=7358050}, author = {Maximilian Nickel and Kevin Murphy and Tresp, Volker and Gabrilovich, Evgeniy} } @article {2065, title = {Review of the CBMM workshop on the Turing++ Question: {\textquoteright}who is there?{\textquoteright}}, year = {2016}, month = {01/2016}, abstract = {

From the 3rd to the 5th of September 2015, the Center for Brains, Minds and Machines hosted a workshop to address the first Turing++ Question: {\textquoteleft}who is there?{\textquoteright}. The workshop invited experts from the fields of computer vision, cognitive science and neuroscience to engage in a discussion about the neural algorithms and the underlying neural circuits that support the ability of humans and other primates to recognize faces. The goal of the workshop was to generate new ideas about how to make progress toward understanding the neural algorithms that underlie face identification...

}, author = {Ethan Meyers} } @article {2576, title = {Spatio-temporal convolutional networks explain neural representations of human actions}, year = {2016}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {2617, title = {Statistics of natural reverberation enable perceptual separation of sound and space}, journal = {Proceedings of the National Academy of Sciences}, volume = {113}, year = {2016}, month = {09/2016}, pages = {E7856 - E7865}, abstract = {

In everyday listening, sound reaches our ears directly from a source as well as indirectly via reflections known as reverberation. Reverberation profoundly distorts the sound from a source, yet humans can both identify sound sources and distinguish environments from the resulting sound, via mechanisms that remain unclear. The core computational challenge is that the acoustic signatures of the source and environment are combined in a single signal received by the ear. Here we ask whether our recognition of sound sources and spaces reflects an ability to separate their effects and whether any such separation is enabled by statistical regularities of real-world reverberation. To first determine whether such statistical regularities exist, we measured impulse responses (IRs) of 271 spaces sampled from the distribution encountered by humans during daily life. The sampled spaces were diverse, but their IRs were tightly constrained, exhibiting exponential decay at frequency-dependent rates: Mid frequencies reverberated longest whereas higher and lower frequencies decayed more rapidly, presumably due to absorptive properties of materials and air. To test whether humans leverage these regularities, we manipulated IR decay characteristics in simulated reverberant audio. Listeners could discriminate sound sources and environments from these signals, but their abilities degraded when reverberation characteristics deviated from those of real-world environments. Subjectively, atypical IRs were mistaken for sound sources. The results suggest the brain separates sound into contributions from the source and the environment, constrained by a prior on natural reverberation. This separation process may contribute to robust recognition while providing information about spaces around us.
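One way to make the reported regularity concrete is to synthesize an impulse response as band-limited noise with a different exponential decay rate in each frequency band; the bands and decay times below are illustrative, not the measured statistics:

# Sketch: synthetic impulse response with frequency-dependent exponential decay,
# mimicking the regularity described above (mid frequencies decay slowest).
import numpy as np
from scipy.signal import butter, lfilter

fs, dur = 16000, 1.0
t = np.arange(int(fs * dur)) / fs
bands = [(125, 500), (500, 2000), (2000, 6000)]   # Hz, illustrative bands
rt60s = [0.4, 0.8, 0.3]                            # assumed decay times (s) per band

rng = np.random.default_rng(0)
ir = np.zeros_like(t)
for (lo, hi), rt60 in zip(bands, rt60s):
    b, a = butter(2, [lo / (fs / 2), hi / (fs / 2)], btype="bandpass")
    noise = lfilter(b, a, rng.normal(size=t.size))     # band-limited noise carrier
    ir += noise * np.exp(-6.9 * t / rt60)              # exponential envelope: -60 dB at rt60

ir /= np.max(np.abs(ir))
# Convolving a dry recording with `ir` would simulate this synthetic room.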

}, keywords = {auditory scene analysis, environmental acoustics, natural scene statistics, psychoacoustics, Psychophysics}, issn = {0027-8424}, doi = {10.1073/pnas.1612524113}, url = {http://www.pnas.org/lookup/doi/10.1073/pnas.1612524113}, author = {James Traer and Josh H. McDermott} } @article {2243, title = {Streaming Normalization: Towards Simpler and More Biologically-plausible Normalizations for Online and Recurrent Learning}, year = {2016}, month = {10/2016}, abstract = {

We systematically explored a spectrum of normalization algorithms related to Batch Normalization (BN) and propose a generalized formulation that simultaneously solves two major limitations of BN: (1) online learning and (2) recurrent learning. Our proposal is simpler and more biologically plausible. Unlike previous approaches, our technique can be applied out of the box to all learning scenarios (e.g., online learning, batch learning, fully-connected, convolutional, feedforward, recurrent and mixed {\textemdash} recurrent and convolutional) and compares favorably with existing approaches. We also propose Lp Normalization for normalizing by different orders of statistical moments. In particular, L1 normalization performs well, is simple to implement, fast to compute, and more biologically plausible, and is thus ideal for GPU or hardware implementations.
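A minimal sketch of the Lp idea for p = 1 (learned scale/shift parameters and the streaming/recurrent bookkeeping described in the paper are omitted): activations are centered and divided by the mean absolute deviation rather than the standard deviation.

# L1 normalization of a batch of activations: divide by the mean absolute
# deviation instead of the standard deviation (cheaper to compute).
import numpy as np

def l1_normalize(x, eps=1e-5):
    """x: (batch, features). Center and rescale each feature by its first absolute moment."""
    mu = x.mean(axis=0, keepdims=True)
    l1 = np.abs(x - mu).mean(axis=0, keepdims=True)   # mean absolute deviation
    return (x - mu) / (l1 + eps)

x = np.random.default_rng(0).normal(2.0, 3.0, size=(32, 8))
print(l1_normalize(x).mean(axis=0).round(3))          # approximately zero per feature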

}, author = {Qianli Liao and Kenji Kawaguchi and Tomaso Poggio} } @article {2321, title = {Theory I: Why and When Can Deep Networks Avoid the Curse of Dimensionality?}, year = {2016}, month = {11/2016}, abstract = {

[formerly titled "Why and When Can Deep - but Not Shallow - Networks Avoid the Curse of Dimensionality: a Review"]

The paper reviews and extends an emerging body of theoretical results on deep learning, including the conditions under which it can be exponentially better than shallow learning. A class of deep convolutional networks represents an important special case of these conditions, though weight sharing is not the main reason for their exponential advantage. Implications of a few key theorems are discussed, together with new results, open problems and conjectures.
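To convey the flavor of the exponential gap (stated informally here as an illustration, not a verbatim theorem): for target accuracy $\varepsilon$ and a function of $d$ variables with smoothness $m$, bounds of roughly the form

\[ N_{\text{shallow}} = O\!\left(\varepsilon^{-d/m}\right) \qquad \text{versus} \qquad N_{\text{deep}} = O\!\left(d\,\varepsilon^{-2/m}\right) \]

are discussed, where the deep bound applies to compositional functions with a binary-tree structure whose constituent functions each take two arguments, and $N$ counts the required number of units.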

}, author = {Tomaso Poggio and Hrushikesh Mhaskar and Lorenzo Rosasco and Brando Miranda and Qianli Liao} } @article {2883, title = {There{\textquoteright}s Waldo! A Normalization Model of Visual Search Predicts Single-Trial Human Fixations in an Object Search Task [code]}, year = {2016}, abstract = {

When searching for an object in a scene, how does the brain decide where to look next? Visual search theories suggest the existence of a global {\textquotedblleft}priority map{\textquotedblright} that integrates bottom-up visual information with top-down, target-specific signals. We propose a mechanistic model of visual search that is consistent with recent neurophysiological evidence, can localize targets in cluttered images, and predicts single-trial behavior in a search task. This model posits that a high-level retinotopic area selective for shape features receives global, target-specific modulation and implements local normalization through divisive inhibition. The normalization step is critical to prevent highly salient bottom-up features from monopolizing attention. The resulting activity pattern constitutes a priority map that tracks the correlation between local input and target features. The maximum of this priority map is selected as the locus of attention. The visual input is then spatially enhanced around the selected location, allowing object-selective visual areas to determine whether the target is present at this location. This model can localize objects both in array images and when objects are pasted in natural scenes. The model can also predict single-trial human fixations, including those in error and target-absent trials, in a search task involving complex objects.


To view more information and download code, etc., please visit the project website - http://klab.tch.harvard.edu/resources/miconietal_visualsearch_2016.html


The corresponding publication can be found here.


The corresponding dataset entry can be found here.

}, author = {Thomas Miconi and Laura Groomes and Gabriel Kreiman} } @article {2884, title = {There{\textquoteright}s Waldo! A Normalization Model of Visual Search Predicts Single-Trial Human Fixations in an Object Search Task [dataset]}, year = {2016}, abstract = {

When searching for an object in a scene, how does the brain decide where to look next? Visual search theories suggest the existence of a global {\textquotedblleft}priority map{\textquotedblright} that integrates bottom-up visual information with top-down, target-specific signals. We propose a mechanistic model of visual search that is consistent with recent neurophysiological evidence, can localize targets in cluttered images, and predicts single-trial behavior in a search task. This model posits that a high-level retinotopic area selective for shape features receives global, target-specific modulation and implements local normalization through divisive inhibition. The normalization step is critical to prevent highly salient bottom-up features from monopolizing attention. The resulting activity pattern constitutes a priority map that tracks the correlation between local input and target features. The maximum of this priority map is selected as the locus of attention. The visual input is then spatially enhanced around the selected location, allowing object-selective visual areas to determine whether the target is present at this location. This model can localize objects both in array images and when objects are pasted in natural scenes. The model can also predict single-trial human fixations, including those in error and target-absent trials, in a search task involving complex objects.


To view more information and download datasets, etc., please visit the project website - http://klab.tch.harvard.edu/resources/miconietal_visualsearch_2016.html


The corresponding publication can be found here.


The corresponding code entry can be found here.

}, author = {Thomas Miconi and Laura Groomes and Gabriel Kreiman} } @article {2124, title = {There{\textquoteright}s Waldo! A Normalization Model of Visual Search Predicts Single-Trial Human Fixations in an Object Search Task}, journal = {Cerebral Cortex}, volume = {26(7)}, year = {2016}, pages = {26:3064-3082}, abstract = {

When searching for an object in a scene, how does the brain decide where to look next? Visual search theories suggest the existence of a global {\textquotedblleft}priority map{\textquotedblright} that integrates bottom-up visual information with top-down, target-specific signals. We propose a mechanistic model of visual search that is consistent with recent neurophysiological evidence, can localize targets in cluttered images, and predicts single-trial behavior in a search task. This model posits that a high-level retinotopic area selective for shape features receives global, target-specific modulation and implements local normalization through divisive inhibition. The normalization step is critical to prevent highly salient bottom-up features from monopolizing attention. The resulting activity pattern constitutes a priority map that tracks the correlation between local input and target features. The maximum of this priority map is selected as the locus of attention. The visual input is then spatially enhanced around the selected location, allowing object-selective visual areas to determine whether the target is present at this location. This model can localize objects both in array images and when objects are pasted in natural scenes. The model can also predict single-trial human fixations, including those in error and target-absent trials, in a search task involving complex objects.
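A toy rendering of the priority-map computation (illustrative only: raw pixel patches and a mean-subtracted template stand in for the model's shape-selective features and target-specific feedback):

# Toy priority map for visual search: target-modulated local responses with
# divisive normalization; the maximum is selected as the next fixation.
import numpy as np

rng = np.random.default_rng(0)
scene = rng.random((64, 64))
target = rng.random((8, 8))
scene[40:48, 16:24] = target                       # embed the target at a known location

H, W = scene.shape
k = 8
w = target.ravel() - target.mean()                 # top-down, target-specific weights
priority = np.zeros((H - k + 1, W - k + 1))
for i in range(priority.shape[0]):
    for j in range(priority.shape[1]):
        patch = scene[i:i + k, j:j + k].ravel()
        drive = float(patch @ w)                   # target-specific modulation
        norm = 1.0 + patch.sum()                   # divisive normalization by local activity
        priority[i, j] = drive / norm              # keeps salient clutter from dominating

fixation = np.unravel_index(np.argmax(priority), priority.shape)
print("selected fixation (row, col):", fixation)   # expected near (40, 16), the target location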

---

Publisher released this paper early online on June 19, 2015.

}, author = {Thomas Miconi and Laura Groomes and Gabriel Kreiman} } @article {3395, title = {To What Extent Does Global Shape Influence Category Representation in the Brain?}, journal = {Journal of Neuroscience}, volume = {36}, year = {2016}, month = {Jan-04-2017}, pages = {4149 - 4151}, issn = {0270-6474}, doi = {10.1523/JNEUROSCI.0387-16.2016}, url = {http://www.jneurosci.org/cgi/doi/10.1523/JNEUROSCI.0387-16.2016}, author = {N. Apurva Ratan Murty and Pramod, R. T.} } @conference {2584, title = {Training and Evaluating Multimodal Word Embeddings with Large-scale Web Annotated Images}, booktitle = {NIPS 2016}, year = {2016}, abstract = {

In this paper, we focus on training and evaluating effective word embeddings with both text and visual information. More specifically, we introduce a large-scale dataset with 300 million sentences describing over 40 million images crawled and downloaded from publicly available Pins (i.e. an image with sentence descriptions uploaded by users) on Pinterest [2]. This dataset is more than 200 times larger than MS COCO [22], the standard large-scale image dataset with sentence descriptions. In addition, we construct an evaluation dataset to directly assess the effectiveness of word embeddings in terms of finding semantically similar or related words and phrases. The word/phrase pairs in this evaluation dataset are collected from the click data with millions of users in an image search system, and thus contain rich semantic relationships. Based on these datasets, we propose and compare several Recurrent Neural Networks (RNNs) based multimodal (text and image) models. Experiments show that our model benefits from incorporating the visual information into the word embeddings, and a weight sharing strategy is crucial for learning such multimodal embeddings. The project page is: http://www.stat.ucla.edu/~junhua.mao/multimodal_embedding.html

}, author = {Junhua Mao and Jianjing Xu and Yushi Jing and Alan Yuille} } @article {2825, title = {Treebank of Learner English (TLE)}, year = {2016}, month = {08/2016}, abstract = {

The majority of the English text available worldwide is generated by non-native speakers. Learner language introduces a variety of challenges and is of paramount importance for the scientific study of language acquisition as well as for Natural Language Processing. Despite the ubiquity of non-native English, there has been no publicly available syntactic treebank for English as a Second Language (ESL). To address this shortcoming, we released the Treebank of Learner English (TLE), a first-of-its-kind resource for non-native English, containing 5,124 sentences manually annotated with Part of Speech (POS) tags and syntactic dependency trees. Full syntactic analyses are provided for both the original and error corrected versions of each sentence. We also introduced annotation guidelines that allow for consistent syntactic treatment of ungrammatical English. We envision the treebank to support a wide range of linguistic and computational research on language learning as well as automatic processing of ungrammatical language.

}, url = {http://esltreebank.org/}, author = {Yevgeni Berzak and Jessica Kenney and Carolyn Spadine and Jing Xian Wang and Lucia Lam and Keiko Sophie Mori and Sebastian Garza and Boris Katz} } @article {2351, title = {The Trolley Problem [Edge.com]}, year = {2016}, month = {12/2016}, publisher = {Edge.com}, chapter = {2017 : WHAT SCIENTIFIC TERM OR CONCEPT OUGHT TO BE MORE WIDELY KNOWN?}, abstract = {

"The history of science is littered with {\textquotedblleft}thought experiments,{\textquotedblright} a term dreamed up by Albert Einstein ({\textquotedblleft}gedankenexperiment{\textquotedblright}) for an imagined scenario able to sharply articulate the crux of some intellectual puzzle, and in so doing excite some deep thinking on the way to a solution or related discovery. Among the most famous are Einstein{\textquoteright}s tale of chasing a light beam that led him to a theory of special relativity and Erwin Schr{\"o}dinger{\textquoteright}s story of the poor cat, stuck in a fiendishly designed quantum mechanical box, forever half-alive and half-dead, that highlighted the complex interactions between wave mechanics and measurement.

{\textquotedblleft}The Trolley Problem{\textquotedblright} is another thought experiment, one that arose in moral philosophy. There are many versions, but here is one: A trolley is rolling down the tracks and reaches a branchpoint. To the left, one person is trapped on the tracks, and to the right, five people. You can throw a switch that diverts the trolley from the track with the five to the track with the one. Do you? The trolley can{\textquoteright}t brake. What if we know more about the people on the tracks? Maybe the one is a child and the five are elderly? Maybe the one is a parent and the others are single? How do all these different scenarios change things? What matters? What are you valuing and why?..."

}, url = {https://www.edge.org/response-detail/27051}, author = {Dan Rockmore} } @article {2047, title = {Turing++ Questions: A Test for the Science of (Human) Intelligence.}, journal = { AI Magazine}, volume = {37 }, year = {2016}, month = {03/2016}, pages = {73-77}, abstract = {

It is becoming increasingly clear that there is an infinite number of definitions of intelligence. Machines that are intelligent in different narrow ways have been built since the 50s. We are entering now a golden age for the engineering of intelligence and the development of many different kinds of intelligent machines. At the same time there is a widespread interest among scientists in understanding a specific and well defined form of intelligence, that is human intelligence. For this reason we propose a stronger version of the original Turing test. In particular, we describe here an open-ended set of Turing++ Questions that we are developing at the Center for Brains, Minds and Machines at MIT {\textemdash} that is questions about an image. Questions may range from what is there to who is there, what is this person doing, what is this girl thinking about this boy and so on.\  The plural in questions is to emphasize that there are many different intelligent abilities in humans that have to be characterized, and possibly replicated in a machine, from basic visual recognition of objects, to the identification of faces, to gauge emotions, to social intelligence, to language and much more. The term Turing++ is to emphasize that our goal is understanding human intelligence at all Marr{\textquoteright}s levels {\textemdash} from the level of the computations to the level of the underlying circuits. Answers to the Turing++ Questions should thus be given in terms of models that match human behavior and human physiology {\textemdash} the mind and the brain. These requirements are thus well beyond the original Turing test. A whole scientific field that we call the science of (human) intelligence is required to make progress in answering our Turing++ Questions. It is connected to neuroscience and to the engineering of intelligence but also separate from both of them.

}, doi = {http://dx.doi.org/10.1609/aimag.v37i1.2641}, url = {http://www.aaai.org/ojs/index.php/aimagazine/article/view/2641}, author = {Tomaso Poggio and Ethan Meyers} } @article {2194, title = {Uncovering representations of sleep-associated hippocampal ensemble spike activity}, journal = {Scientific Reports}, volume = {6}, year = {2016}, month = {08/2016}, abstract = {

Pyramidal neurons in the rodent hippocampus exhibit spatial tuning during spatial navigation, and they are reactivated in specific temporal order during sharp-wave ripples observed in quiet wakefulness or slow wave sleep. However, analyzing representations of sleep-associated hippocampal ensemble spike activity remains a great challenge. In contrast to wake, during sleep there is a complete absence of animal behavior, and the ensemble spike activity is sparse (low occurrence) and fragmental in time. To examine important issues encountered in sleep data analysis, we constructed synthetic sleep-like hippocampal spike data (short epochs, sparse and sporadic firing, compressed timescale) for detailed investigations. Based upon two Bayesian population-decoding methods (one receptive field-based, and the other not), we systematically investigated their representation power and detection reliability. Notably, the receptive-field-free decoding method was found to be well-tuned for hippocampal ensemble spike data in slow wave sleep (SWS), even in the absence of prior behavioral measure or ground truth. Our results showed that, in addition to the sample length, bin size, and firing rate, the number of active hippocampal pyramidal neurons is critical for reliable representation of the space as well as for detection of spatiotemporal reactivated patterns in SWS or quiet wakefulness.
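For contrast with the paper's receptive-field-free method, the standard receptive-field-based variant of Bayesian population decoding is easy to sketch (Poisson spiking, synthetic place fields, flat prior over position; illustrative values only):

# Bayesian decoding of position from population spike counts in one time bin,
# assuming Poisson spiking and known place fields (synthetic example).
import numpy as np

rng = np.random.default_rng(0)
n_pos, n_cells, dt = 50, 30, 0.25                  # positions on a track, cells, bin width (s)
centers = rng.uniform(0, n_pos, n_cells)
pos_axis = np.arange(n_pos)
# Gaussian place fields: expected rate (Hz) of each cell at each position.
rates = 0.5 + 15 * np.exp(-0.5 * ((pos_axis[:, None] - centers[None, :]) / 3.0) ** 2)

true_pos = 20
counts = rng.poisson(rates[true_pos] * dt)         # observed spike counts in one bin

# log P(pos | counts) is proportional, under a flat prior, to
# sum_i [ counts_i * log(rate_i(pos) * dt) - rate_i(pos) * dt ].
log_post = (counts[None, :] * np.log(rates * dt) - rates * dt).sum(axis=1)
post = np.exp(log_post - log_post.max())
post /= post.sum()
print("decoded position:", int(post.argmax()), "true position:", true_pos)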

}, doi = {10.1038/srep32193}, url = {http://dx.doi.org/10.1038/srep32193}, author = {Zhe Chen and Andres D. Grosmark and Hector Penagos and Matthew A. Wilson} } @proceedings {1724, title = {Understanding "almost": Empirical and computational studies of near misses}, year = {2016}, publisher = {38th Annual Meeting of the Cognitive Science Society}, author = {Tobias Gerstenberg and Joshua B. Tenenbaum} } @article {2134, title = {Universal Dependencies for Learner English}, year = {2016}, month = {06/2016}, abstract = {

We introduce the Treebank of Learner English (TLE), the first publicly available syntactic treebank for English as a Second Language (ESL). The TLE provides manually annotated POS tags and Universal Dependency (UD) trees for 5,124 sentences from the Cambridge First Certificate in English (FCE) corpus. The UD annotations are tied to a pre-existing error annotation of the FCE, whereby full syntactic analyses are provided for both the original and error corrected versions of each sentence. Furthermore, we delineate ESL annotation guidelines that allow for consistent syntactic treatment of ungrammatical English. Finally, we benchmark POS tagging and dependency parsing performance on the TLE dataset and measure the effect of grammatical errors on parsing accuracy. We envision the treebank to support a wide range of linguistic and computational research on second language acquisition as well as automatic processing of ungrammatical language.

}, author = {Yevgeni Berzak and Jessica Kenney and Carolyn Spadine and Jing Xian Wang and Lucia Lam and Keiko Sophie Mori and Sebastian Garza and Boris Katz} } @conference {1740, title = {Unsupervised Learning of Visual Structure using Predictive Generative Networks}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2016}, month = {May 2016}, address = {San Juan, Puerto Rico}, abstract = {

The ability to predict future states of the environment is a central pillar of intelligence. At its core, effective prediction requires an internal model of the world and an understanding of the rules by which the world changes. Here, we explore the internal models developed by deep neural networks trained using a loss based on predicting future frames in synthetic video sequences, using a CNN-LSTM-deCNN framework. We first show that this architecture can achieve excellent performance in visual sequence prediction tasks, including state-of-the-art performance in a standard {\textquoteright}bouncing balls{\textquoteright} dataset (Sutskever et al., 2009). Using a weighted mean-squared error and adversarial loss (Goodfellow et al., 2014), the same architecture successfully extrapolates out-of-the-plane rotations of computer-generated faces. Furthermore, despite being trained end-to-end to predict only pixel-level information, our Predictive Generative Networks learn a representation of the latent structure of the underlying three-dimensional objects themselves. Importantly, we find that this representation is naturally tolerant to object transformations, and generalizes well to new tasks, such as classification of static images. Similar models trained solely with a reconstruction loss fail to generalize as effectively. We argue that prediction can serve as a powerful unsupervised loss for learning rich internal representations of high-level object features.

}, url = {http://arxiv.org/pdf/1511.06380v2.pdf}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {1777, title = {VerbCorner: Testing theories of argument structure through crowdsourcing}, year = {2016}, author = {J. K. Hartshorne}, editor = {M. Palmer} } @article {2122, title = {View-tolerant face recognition and Hebbian learning imply mirror-symmetric neural tuning to head orientation}, year = {2016}, month = {06/2016}, abstract = {

The primate brain contains a hierarchy of visual areas, dubbed the ventral stream, which rapidly computes object representations that are both specific for object identity and relatively robust against identity-preserving transformations like depth-rotations [33, 32, 23, 13]. Current computational models of object recognition, including recent deep learning networks, generate these properties through a hierarchy of alternating selectivity-increasing filtering and tolerance-increasing pooling operations, similar to simple-complex cells operations [46, 8, 44, 29]. While simulations of these models recapitulate the ventral stream{\textquoteright}s progression from early view-specific to late view-tolerant representations, they fail to generate the most salient property of the intermediate representation for faces found in the brain: mirror-symmetric tuning of the neural population to head orientation [16]. Here we prove that a class of hierarchical architectures and a broad set of biologically plausible learning rules can provide approximate invariance at the top level of the network. While most of the learning rules do not yield mirror-symmetry in the mid-level representations, we characterize a specific biologically-plausible Hebb-type learning rule that is guaranteed to generate mirror-symmetric tuning to faces at intermediate levels of the architecture.
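The flavor of the argument can be reproduced numerically: Oja-type Hebbian rules converge to principal components, and for a view set that is closed under mirror reflection those components are even or odd under the reflection, so response magnitudes are mirror-symmetric in viewing angle. The synthetic 1-D "views" below are a stand-in for face images, not the paper's model.

# Toy demonstration: PCA (the fixed point of Oja-style Hebbian learning) on a
# reflection-closed view set yields components that are even or odd under
# reflection, so |projection| is mirror-symmetric across viewing angle.
import numpy as np

dim = 65
positions = np.arange(dim)
angles = np.linspace(-60, 60, 9)                 # "head orientations", degrees

def view(a):
    # A bump whose location shifts with viewing angle; mirroring the vector
    # maps the view at +a onto the view at -a (reflection about the center).
    return np.exp(-0.5 * ((positions - (dim // 2 + a / 5)) / 4.0) ** 2)

V = np.stack([view(a) for a in angles])          # (n_views, dim), closed under reflection
Vc = V - V.mean(axis=0)
_, _, comps = np.linalg.svd(Vc, full_matrices=False)

for k in range(3):                               # first few "units"
    resp = V @ comps[k]                          # tuning curve across viewing angle
    asym = np.max(np.abs(np.abs(resp) - np.abs(resp[::-1])))
    # Asymmetry should be near zero whenever the component's eigenvalue is non-degenerate.
    print(f"unit {k}: max asymmetry of |response| = {asym:.2e}")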

}, author = {JZ. Leibo and Qianli Liao and W. A. Freiwald and F. Anselmi and Tomaso Poggio} } @conference {2547, title = {Visual Concept Recognition and Localization via Iterative Introspection. }, booktitle = {Asian Conference on Computer Vision}, year = {2016}, month = {11/2016}, author = {Amir Rosenfeld and Shimon Ullman} } @book {2207, title = {Visual Cortex and Deep Networks: Learning Invariant Representations}, year = {2016}, month = {09/2016}, pages = {136}, publisher = {The MIT Press}, organization = {The MIT Press}, address = {Cambridge, MA, USA}, abstract = {

The ventral visual stream is believed to underlie object recognition in primates. Over the past fifty years, researchers have developed a series of quantitative models that are increasingly faithful to the biological architecture. Recently, deep learning convolution networks{\textemdash}which do not reflect several important features of the ventral stream architecture and physiology{\textemdash}have been trained with extremely large datasets, resulting in model neurons that mimic object recognition but do not explain the nature of the computations carried out in the ventral stream. This book develops a mathematical framework that describes learning of invariant representations of the ventral stream and is particularly relevant to deep convolutional learning networks.

The authors propose a theory based on the hypothesis that the main computational goal of the ventral stream is to compute neural representations of images that are invariant to transformations commonly encountered in the visual environment and are learned from unsupervised experience. They describe a general theoretical framework of a computational theory of invariance (with details and proofs offered in appendixes) and then review the application of the theory to the feedforward path of the ventral stream in the primate visual cortex.

}, isbn = {Hardcover: 9780262034722 | eBook: 9780262336703}, url = {https://mitpress.mit.edu/books/visual-cortex-and-deep-networks}, author = {Tomaso Poggio and F. Anselmi} } @conference {2747, title = {Visually indicated sounds}, booktitle = {Conference on Computer Vision and Pattern Recognition}, year = {2016}, month = {06/2016}, author = {Owens, Andrew and Isola, P. and Josh H. McDermott and Torralba, Antonio and Adelson, Edward H. and William T. Freeman} } @article {2608, title = {Welfare-tradeoff ratios in children}, year = {2016}, address = {Vancouver, Canada}, author = {A C Spokes and Howard, R and S A Mehr and Krasnow, M M} } @article {2645, title = {When Does Model-Based Control Pay Off?}, journal = {PLoS Comput Biol}, volume = {12}, year = {2016}, month = {2016 Aug}, pages = {e1005090}, abstract = {

Many accounts of decision making and reinforcement learning posit the existence of two distinct systems that control choice: a fast, automatic system and a slow, deliberative system. Recent research formalizes this distinction by mapping these systems to "model-free" and "model-based" strategies in reinforcement learning. Model-free strategies are computationally cheap, but sometimes inaccurate, because action values can be accessed by inspecting a look-up table constructed through trial-and-error. In contrast, model-based strategies compute action values through planning in a causal model of the environment, which is more accurate but also more cognitively demanding. It is assumed that this trade-off between accuracy and computational demand plays an important role in the arbitration between the two strategies, but we show that the hallmark task for dissociating model-free and model-based strategies, as well as several related variants, do not embody such a trade-off. We describe five factors that reduce the effectiveness of the model-based strategy on these tasks by reducing its accuracy in estimating reward outcomes and decreasing the importance of its choices. Based on these observations, we describe a version of the task that formally and empirically obtains an accuracy-demand trade-off between model-free and model-based strategies. Moreover, we show that human participants spontaneously increase their reliance on model-based control on this task, compared to the original paradigm. Our novel task and our computational analyses may prove important in subsequent empirical investigations of how humans balance accuracy and demand.

}, issn = {1553-7358}, doi = {10.1371/journal.pcbi.1005090}, author = {Kool, Wouter and Fiery A Cushman and Samuel J Gershman} } @article {2257, title = {Where do hypotheses come from?}, year = {2016}, month = {10/2016}, abstract = {

Why are human inferences sometimes remarkably close to the Bayesian ideal and other times systematically biased? One notable instance of this discrepancy is that tasks where the candidate hypotheses are explicitly available result in close to rational inference over the hypothesis space, whereas tasks requiring the self-generation of hypotheses produce systematic deviations from rational inference. We propose that these deviations arise from algorithmic processes approximating Bayes{\textquoteright} rule. Specifically in our account, hypotheses are generated stochastically from a sampling process, such that the sampled hypotheses form a Monte Carlo approximation of the posterior. While this approximation will converge to the true posterior in the limit of infinite samples, we take a small number of samples as we expect that the number of samples humans take is limited by time pressure and cognitive resource constraints. We show that this model recreates several well-documented experimental findings such as anchoring and adjustment, subadditivity, superadditivity, the crowd within as well as the self-generation effect, the weak evidence, and the dud alternative effects. Additionally, we confirm the model{\textquoteright}s prediction that superadditivity and subadditivity can be induced within the same paradigm by manipulating the unpacking and typicality of hypotheses, in 2 experiments.
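A minimal sketch of the sampling account (toy prior and likelihood values): the posterior over hypotheses is approximated by a handful of generated samples, so judgments are unbiased on average but noisy, and a specific hypothesis is often assigned zero probability simply because it was never generated.

# Approximate Bayesian inference with a small number of self-generated hypotheses.
import numpy as np

rng = np.random.default_rng(0)
hypotheses = np.array(["flu", "cold", "allergy", "strep", "other"])
prior = np.array([0.15, 0.40, 0.25, 0.05, 0.15])
likelihood = np.array([0.8, 0.3, 0.1, 0.9, 0.2])    # P(symptom | hypothesis), assumed values

posterior = prior * likelihood
posterior /= posterior.sum()
print(f"exact posterior P(flu): {posterior[0]:.2f}")

def judged_probability(query, n_samples):
    # Generate hypotheses by sampling from the posterior, then judge P(query)
    # as its relative frequency among the generated samples.
    samples = rng.choice(hypotheses, size=n_samples, p=posterior)
    return np.mean(samples == query)

for n in (3, 10, 1000):
    judgments = np.array([judged_probability("flu", n) for _ in range(2000)])
    print(f"n={n:4d}  mean judgment {judgments.mean():.2f}  "
          f"sd {judgments.std():.2f}  P(judged zero) {np.mean(judgments == 0):.2f}")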

}, author = {Ishita Dasgupta and Eric Schulz and Samuel J Gershman} } @article {2596, title = {Young Children{\textquoteright}s Use of Surface and Object Information in Drawings of Everyday Scenes}, journal = {Child Development}, year = {2016}, doi = {10.1111/cdev.12658}, author = {Moira R Dillon and Elizabeth S Spelke} } @conference {2553, title = {Zoom better to see clearer: Human and object parsing with hierarchical auto-zoom net}, booktitle = {ECCV}, year = {2016}, month = {09/2016}, address = {Amsterdam, The Netherlands}, author = {Fangting Xia and Peng Wang and Liang-chieh Chen and Alan Yuille} } @conference {2582, title = {Zoom Better to See Clearer: Human Part Segmentation with Auto Zoom Net}, booktitle = {ECCV}, year = {2016}, abstract = {

Parsing articulated objects, e.g. humans and animals, into semantic parts (e.g. body, head and arms, etc.) from natural images is a challenging and fundamental problem for computer vision. A big difficulty is the large variability of scale and location for objects and their corresponding parts. Even limited mistakes in estimating scale and location will degrade the parsing output and cause errors in boundary details. To tackle these difficulties, we propose a {\textquotedblleft}Hierarchical Auto-Zoom Net{\textquotedblright} (HAZN) for object part parsing which adapts to the local scales of objects and parts. HAZN is a sequence of two {\textquotedblleft}Auto-Zoom Nets{\textquotedblright} (AZNs), each employing fully convolutional networks that perform two tasks: (1) predict the locations and scales of object instances (the first AZN) or their parts (the second AZN); (2) estimate the part scores for predicted object instance or part regions. Our model can adaptively {\textquotedblleft}zoom{\textquotedblright} (resize) predicted image regions into their proper scales to refine the parsing. We conduct extensive experiments over the PASCAL part datasets on humans, horses, and cows. For humans, our approach significantly outperforms the state of the art by 5\% mIOU and is especially better at segmenting small instances and small parts. We obtain similar improvements for parsing cows and horses over alternative methods. In summary, our strategy of first zooming into objects and then zooming into parts is very effective. It also enables us to process different regions of the image at different scales adaptively so that, for example, we do not need to waste computational resources scaling the entire image.

}, author = {Fangting Xia and Peng Wang and Liang-chieh Chen and Alan Yuille} } @article {2655, title = {Assessing the precision of gaze following using a stereoscopic 3D virtual reality setting.}, journal = {Vision Res}, volume = {112}, year = {2015}, month = {2015 Jul}, pages = {68-82}, abstract = {

Despite the ecological importance of gaze following, little is known about the underlying neuronal processes, which allow us to extract gaze direction from the geometric features of the eye and head of a conspecific. In order to understand the neuronal mechanisms underlying this ability, a careful description of the capacity and the limitations of gaze following at the behavioral level is needed. Previous studies of gaze following, which relied on naturalistic settings have the disadvantage of allowing only very limited control of potentially relevant visual features guiding gaze following, such as the contrast of iris and sclera, the shape of the eyelids and--in the case of photographs--they lack depth. Hence, in order to get full control of potentially relevant features we decided to study gaze following of human observers guided by the gaze of a human avatar seen stereoscopically. To this end we established a stereoscopic 3D virtual reality setup, in which we tested human subjects{\textquoteright} ability to detect which target a human avatar was looking at. Following the gaze of the avatar showed all the features of the gaze following of a natural person, namely a substantial degree of precision associated with a consistent pattern of systematic deviations from the target. Poor stereo vision affected performance surprisingly little (only in certain experimental conditions). Only gaze following guided by targets at larger downward eccentricities exhibited a differential effect of the presence or absence of accompanying movements of the avatar{\textquoteright}s eyelids and eyebrows.

}, keywords = {Adult, Attention, Computer Simulation, Cues, Discrimination (Psychology), Eye Movements, Eyebrows, Eyelids, Female, Humans, Judgment, Male, Middle Aged, Young Adult}, issn = {1878-5646}, doi = {10.1016/j.visres.2015.04.015}, author = {Atabaki, Artin and Marciniak, Karolina and Dicke, Peter W and Thier, Peter} } @article {1759, title = {Canonical genetic signatures of the adult human brain}, journal = {Nature Neuroscience}, volume = {18}, year = {2015}, pages = {1844}, chapter = {1832}, abstract = {
The structure and function of the human brain are highly stereotyped, implying a conserved molecular program responsible for its development, cellular structure and function. We applied a correlation-based metric called differential stability to assess reproducibility of gene expression patterning across 132 structures in six individual brains, revealing mesoscale genetic organization. The genes with the highest differential stability are highly biologically relevant, with enrichment for brain-related annotations, disease associations, drug targets and literature citations. Using genes with high differential stability, we identified 32 anatomically diverse and reproducible gene expression signatures, which represent distinct cell types, intracellular components and/or associations with neurodevelopmental and neurodegenerative disorders. Genes in neuron-associated compared to non-neuronal networks showed higher preservation between human and mouse; however, many diversely patterned genes displayed marked shifts in regulation between species. Finally, highly consistent transcriptional architecture in neocortex is correlated with resting state functional connectivity, suggesting a link between conserved gene expression and functionally relevant circuitry
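A small sketch of the differential-stability computation as described above (synthetic expression values; donor and structure counts follow the abstract): for each gene, correlate its expression profile across structures between every pair of brains and average.

# Differential stability: for each gene, the mean pairwise Pearson correlation of
# its expression profile across brain structures, over all pairs of donor brains.
import numpy as np
from itertools import combinations

rng = np.random.default_rng(0)
n_brains, n_structures, n_genes = 6, 132, 1000

# Synthetic expression: each gene has a shared structural pattern plus donor-specific noise.
shared = rng.normal(size=(n_genes, n_structures))
reproducibility = rng.uniform(0, 1, size=(n_genes, 1))    # how conserved each gene's pattern is
expr = (reproducibility[None, :, :] * shared[None, :, :]
        + (1 - reproducibility)[None, :, :] * rng.normal(size=(n_brains, n_genes, n_structures)))

def differential_stability(expr):
    """expr: (brains, genes, structures) -> mean pairwise correlation per gene."""
    ds = np.zeros(expr.shape[1])
    pairs = list(combinations(range(expr.shape[0]), 2))
    for g in range(expr.shape[1]):
        cors = [np.corrcoef(expr[i, g], expr[j, g])[0, 1] for i, j in pairs]
        ds[g] = np.mean(cors)
    return ds

ds = differential_stability(expr)
print("most stable genes (indices):", np.argsort(ds)[::-1][:5])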
}, author = {Michael Hawrylycz and Jeremy A Miller and Vilas Menon and David Feng and Tim Dolbeare and Angela L Guillozet-Bongaarts and Anil G Jegga and Bruce J Aronow and Chang-Kyu Lee and Amy Bernard and Matthew F Glasser and Donna L Dierker and J{\"o}rge Menche and Aaron Szafer and Forrest Collman and Pascal Grange and Kenneth A Berman and Stefan Mihalas and Zizhen Yao and Lance Stewart and Albert-L{\'a}szl{\'o} Barab{\'a}si and Jay Schulkin and John Phillips and Lydia Ng and Chinh Dang and David R Haynor and Allan Jones and David C Van Essen and Christof Koch and Ed Lein} } @article {1776, title = {The causes and consequences explicit in verbs}, journal = {Language, Cognition and Neuroscience}, volume = {30}, year = {2015}, month = {02/09/2015}, pages = {716-734}, abstract = {
Interpretation of a pronoun in one clause can be systematically affected by the verb in the previous clause. Compare Archibald angered Bartholomew because he ... (he = Archibald) with Archibald criticised Bartholomew because he ... (he = Bartholomew). While it is clear that meaning plays a critical role, it is unclear whether that meaning is directly encoded in the verb or, alternatively, inferred from world knowledge. We report evidence favouring the former account. We elicited pronoun biases for 502 verbs from seven Levin verb classes in two discourse contexts (implicit causality and implicit consequentiality), showing that in both contexts, verb class reliably predicts pronoun bias. These results confirm and extend recent findings about implicit causality and represent the first such study for implicit consequentiality. We discuss these findings in the context of recent work in semantics, and also develop a new, probabilistic generative account of pronoun interpretation.
}, issn = {2327-3798 (Print) 2327-3801 (Online)}, doi = {10.1080/23273798.2015.1008524}, url = {http://dx.doi.org/10.1080/23273798.2015.1008524}, author = {J. K. Hartshorne}, editor = {T. J. O{\textquoteright}Donnell and Joshua B. Tenenbaum} } @article {1194, title = {Children{\textquoteright}s understanding of the costs and rewards underlying rational action}, journal = {Cognition}, volume = {140}, year = {2015}, month = {07/2015}, pages = {14{\textendash}23}, abstract = {

Humans explain and predict other agents{\textquoteright} behavior using mental state concepts, such as beliefs and desires. Computational and developmental evidence suggests that such inferences are enabled by a principle of rational action: the expectation that agents act efficiently, within situational constraints, to achieve their goals. Here we propose that the expectation of rational action is instantiated by a na{\"\i}ve utility calculus sensitive to both agent-constant and agent-specific aspects of costs and rewards associated with actions. In four experiments, we show that, given an agent{\textquoteright}s choices, children (range: 5-6 year olds; N=96) can infer unobservable aspects of costs (differences in agents{\textquoteright} competence) from information about subjective differences in rewards (differences in agents{\textquoteright} preferences) and vice versa. Moreover, children can design informative experiments on both objects and agents to infer unobservable constraints on agents{\textquoteright} actions.

}, doi = {10.1016/j.cognition.2015.03.006}, url = {http://www.sciencedirect.com/science/article/pii/S0010027715000566}, author = {Julian Jara-Ettinger and Hyowon Gweon and Joshua B. Tenenbaum and Laura Schulz} } @article {1668, title = {Children{\textquoteright}s expectations about training the approximate number system.}, journal = {British Journal of Developmental Psychology}, volume = {33}, year = {2015}, chapter = {411}, abstract = {

Humans possess a developmentally precocious and evolutionarily ancient Approximate Number System (ANS) whose sensitivity correlates with uniquely human symbolic arithmetic skills. Recent studies suggest that ANS training improves symbolic arithmetic, but such studies may engender performance expectations in their participants that in turn produce the improvement. Here we assessed 6- to 8-year-old children{\textquoteright}s expectations about the effects of numerical and non-numerical magnitude training, as well as states of satiety and restfulness, in the context of a study linking children{\textquoteright}s ANS practice to their improved symbolic arithmetic. We found that children did not expect gains in symbolic arithmetic after exercising the ANS, though they did expect gains in ANS acuity after training on any magnitude task. Moreover, children expected gains in symbolic arithmetic after a good night{\textquoteright}s sleep and their favorite breakfast. Thus, children{\textquoteright}s improved symbolic arithmetic after ANS training cannot be explained by their expectations about that training.

}, author = {Moira R Dillon and Pires, A. C. and Hyde, D. C. and Elizabeth S Spelke} } @article {1354, title = {Complexity of Representation and Inference in Compositional Models with Part Sharing}, number = {031}, year = {2015}, month = {05/2015}, abstract = {This paper performs a complexity analysis of a class of serial and parallel compositional models of multiple objects and shows that they enable efficient representation and rapid inference. Compositional models are generative and represent objects in a hierarchically distributed manner in terms of parts and subparts, which are constructed recursively by part-subpart compositions. Parts are represented more coarsely at higher level of the hierarchy, so that the upper levels give coarse summary descriptions (e.g., there is a horse in the image) while the lower levels represents the details (e.g., the positions of the legs of the horse). This hierarchically distributed representation obeys the executive summary principle, meaning that a high level executive only requires a coarse summary description and can, if necessary, get more details by consulting lower level executives. The parts and subparts are organized in terms of hierarchical dictionaries which enables part sharing between different objects allowing efficient representation of many objects. The first main contribution of this paper is to show that compositional models can be mapped onto a parallel visual architecture similar to that used by bio-inspired visual models such as deep convolutional networks but more explicit in terms of representation, hence enabling part detection as well as object detection, and suitable for complexity analysis. Inference algorithms can be run on this architecture to exploit the gains caused by part sharing and executive summary. Effectively, this compositional architecture enables us to perform exact inference simultaneously over a large class of generative models of objects.The second contribution is an analysis of the complexity of compositional models in terms of computation time (for serial computers) and numbers of nodes (e.g., {\textquoteleft}{\textquoteleft}neurons") for parallel computers. In particular, we compute the complexity gains by part sharing and executive summary and their dependence on how the dictionary scales with the level of the hierarchy. We explore three regimes of scaling behavior where the dictionary size (i) increases exponentially with the level of the hierarchy, (ii) is determined by an unsupervised compositional learning algorithm applied to real data, (iii) decreases exponentially with scale. This analysis shows that in some regimes the use of shared parts enables algorithms which can perform inference in time linear in the number of levels for an exponential number of objects. In other regimes part sharing has little advantage for serial computers but can enable linear processing on parallel computers.}, author = {Alan Yuille and Roozbeh Mottaghi} } @article {1092, title = {A Compositional Framework for Grounding Language Inference, Generation, and Acquisition in Video}, year = {2015}, abstract = {

We present an approach to simultaneously reasoning about a video clip and an entire natural-language sentence. The compositional nature of language is exploited to construct models which represent the meanings of entire sentences composed out of the meanings of the words in those sentences mediated by a grammar that encodes the predicate-argument relations. We demonstrate that these models faithfully represent the meanings of sentences and are sensitive to how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions) affect the meaning of a sentence and how it is grounded in video. We exploit this methodology in three ways. In the first, a video clip along with a sentence are taken as input and the participants in the event described by the sentence are highlighted, even when the clip depicts multiple similar simultaneous events. In the second, a video clip is taken as input without a sentence and a sentence is generated that describes an event in that clip. In the third, a corpus of video clips is paired with sentences which describe some of the events in those clips and the meanings of the words in those sentences are learned. We learn these meanings without needing to specify which attribute of the video clips each word in a given sentence refers to. The learned meaning representations are shown to be intelligible to humans.

}, doi = {doi:10.1613/jair.4556}, url = {https://www.jair.org/media/4556/live-4556-8631-jair.pdf}, author = {Honan Yu and N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {1397, title = {Computational rationality: A converging paradigm for intelligence in brains, minds, and machines}, journal = {Science}, volume = {349}, year = {2015}, month = {07/17/2015}, pages = {273-278}, type = {Review; Special Section: Artificial Intelligence}, abstract = {
After growing up together, and mostly growing apart in the second half of the 20th century, the fields of artificial intelligence (AI), cognitive science, and neuroscience are reconverging on a shared view of the computational foundations of intelligence that promotes valuable cross-disciplinary exchanges on questions, methods, and results. We chart advances over the past several decades that address challenges of perception and action under uncertainty through the lens of computation. Advances include the development of representations and inferential procedures for large-scale probabilistic inference and machinery for enabling reflection and decisions about tradeoffs in effort, precision, and timeliness of computations. These tools are deployed toward the goal of computational rationality: identifying decisions with highest expected utility, while taking into consideration the costs of computation in complex real-world problems in which most relevant calculations can only be approximated. We highlight key concepts with examples that show the potential for interchange between computer science, cognitive science, and neuroscience.
}, doi = {10.1126/science.aac6076 }, url = {http://www.sciencemag.org/content/349/6245/273.abstract}, author = {Samuel J Gershman and Eric J. Horvitz and Joshua B. Tenenbaum} } @conference {1669, title = {Connecting core cognition, spatial symbols, and the abstract concepts of formal geometry.}, booktitle = {Cognitive Development Society Post-Conference, More on Development}, year = {2015}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {1009, title = {Consciousness: here, there and everywhere?}, journal = {Phil. Trans. Roy Society B}, volume = {370}, year = {2015}, abstract = {

The science of consciousness has made great strides by focusing on the behavioral and neuronal correlates of experience. However, while such correlates are important for progress to occur, they are not enough if we are to understand even basic facts, for example, why the cerebral cortex gives rise to consciousness but the cerebellum does not, though it has even more neurons and appears to be just as complicated. Moreover, correlates are of little help in many instances where we would like to know if consciousness is present: patients with a few remaining islands of functioning cortex, pre-term infants, non-mammalian species, and machines that are rapidly outperforming people at driving, recognizing faces and objects, and answering difficult questions. To address these issues, we need not only more data, but also a theory of consciousness {\textendash} one that says what experience is and what type of physical systems can have it. Integrated Information Theory (IIT) does so by starting from experience itself via five phenomenological axioms: intrinsic existence, composition, information, integration, and exclusion. From these it derives five postulates about the properties required of physical mechanisms to support consciousness. The theory provides a principled account of both the quantity and the quality of an individual experience (a quale), and a calculus to evaluate whether or not a particular physical system is conscious and of what. Moreover, IIT can explain a range of clinical and laboratory findings, makes a number of testable predictions, and extrapolates to a number of problematic conditions. The theory holds that consciousness is a fundamental property possessed by physical systems having specific causal properties. It predicts that consciousness is graded, is common among biological organisms, and can occur in some very simple systems. Conversely, it predicts that feed-forward networks, even complex ones, are not conscious, nor are aggregates such as groups of individuals or heaps of sand. Also, in sharp contrast with widespread functionalist beliefs, IIT implies that digital computers, even if their behavior were to be functionally equivalent to ours, and even if they were to run faithful simulations of the human brain, would experience next to nothing.

}, doi = {20140167}, url = {http://dx.doi.org/10.1098/rstb.2014.0167}, author = {Christof Koch and Tononi, Giulio} } @article {1007, title = {Contrasting Specializations for Facial Motion within the Macaque Face-Processing System}, journal = {Current Biology}, volume = {25}, year = {2015}, type = {Report}, chapter = {261}, abstract = {

Facial motion transmits rich and ethologically vital information [1, 2], but how the brain interprets this complex signal is poorly understood. Facial form is analyzed by anatomically distinct face patches in the macaque brain [3, 4], and facial motion activates these patches and surrounding areas [5, 6]. Yet, it is not known whether facial motion is processed by its own distinct and specialized neural machinery, and if so, what that machinery{\textquoteright}s organization might be. To address these questions, we used fMRI to monitor the brain activity of macaque monkeys while they viewed low- and high-level motion and form stimuli. We found that, beyond classical motion areas and the known face patch system, moving faces recruited a heretofore unrecognized face patch. Although all face patches displayed distinctive selectivity for face motion over object motion, only two face patches preferred naturally moving faces, while three others preferred randomized, rapidly varying sequences of facial form. This functional divide was anatomically specific, segregating dorsal from ventral face patches, thereby revealing a new organizational principle of the macaque face-processing system.

}, author = {Clark Fisher and W. A. Freiwald} } @conference {1077, title = {Contrastive Analysis with Predictive Power: Typology Driven Estimation of Grammatical Error Distributions in ESL}, booktitle = {Nineteenth Conference on Computational Natural Language Learning (CoNLL), Beijing, China}, year = {2015}, month = {07/31/2015}, abstract = {

This work examines the impact of cross-linguistic transfer on grammatical errors in English as Second Language (ESL) texts. Using a computational framework that formalizes the theory of Contrastive Analysis (CA), we demonstrate that language-specific error distributions in ESL writing can be predicted from the typological properties of the native language and their relation to the typology of English. Our typology-driven model enables us to obtain accurate estimates of such distributions without access to any ESL data for the target languages. Furthermore, we present a strategy for adjusting our method to low-resource languages that lack typological documentation using a bootstrapping approach which approximates native language typology from ESL texts. Finally, we show that our framework is instrumental for linguistic inquiry seeking to identify first-language factors that contribute to a wide range of difficulties in second language acquisition.

}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} } @article {989, title = {Decoding task and stimulus representation in face-responsive cortex}, year = {2015}, note = {

To be presented

}, author = {Dorit Kliemann and Nir Jacoby and Stefano Anzellottti and Rebecca Saxe} } @article {1155, title = {Decrease in gamma-band activity tracks sequence learning}, journal = {Frontiers in Systems Neuroscience}, volume = {8}, year = {2015}, month = {01/21/2015}, abstract = {

Learning novel sequences constitutes an example of declarative memory formation, involving conscious recall of temporal events. Performance in sequence learning tasks improves with repetition and involves forming temporal associations over scales of seconds to minutes. To further understand the neural circuits underlying declarative sequence learning over trials, we tracked changes in intracranial field potentials (IFPs) recorded from 1142 electrodes implanted throughout temporal and frontal cortical areas in 14 human subjects, while they learned the temporal-order of multiple sequences of images over trials through repeated recall. We observed an increase in power in the gamma frequency band (30{\textendash}100 Hz) in the recall phase, particularly in areas within the temporal lobe including the parahippocampal gyrus. The degree of this gamma power enhancement decreased over trials with improved sequence recall. Modulation of gamma power was directly correlated with the improvement in recall performance. When presenting new sequences, gamma power was reset to high values and decreased again after learning. These observations suggest that signals in the gamma frequency band may play a more prominent role during the early steps of the learning process rather than during the maintenance of memory traces.

}, doi = {10.3389/fnsys.2014.00222}, url = {http://journal.frontiersin.org/article/10.3389/fnsys.2014.00222/abstract}, author = {Radhika Madhavan and Daniel Millman and Hanlin Tang and NE Crone and Fredrick A. Lenz and Travis S Tierney and Joseph Madsen and Gabriel Kreiman and WS Anderson} } @article {1364, title = {Deep Captioning with Multimodal Recurrent Neural Networks (m-RNN)}, number = {033}, year = {2015}, month = {05/07/2015}, abstract = {

In this paper, we present a multimodal Recurrent Neural Network (m-RNN) model for generating novel image captions. It directly models the probability distribution of generating a word given previous words and an image. Image captions are generated according to this distribution. The model consists of two sub-networks: a deep recurrent neural network for sentences and a deep convolutional network for images. These two sub-networks interact with each other in a multimodal layer to form the whole m-RNN model. The effectiveness of our model is validated on four benchmark datasets (IAPR TC-12, Flickr 8K, Flickr 30K and MS COCO). Our model outperforms the state-of-the-art methods. In addition, the m-RNN model can be applied to retrieval tasks for retrieving images or sentences, and achieves significant performance improvement over the state-of-the-art methods which directly optimize the ranking objective function for retrieval.
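
For readers who want the gist of the architecture in code, the following is a minimal numpy sketch, not the authors' implementation: all layer sizes, weight initializations, and variable names are illustrative assumptions. It shows the multimodal step in which the next-word distribution is computed from the previous word's embedding, the recurrent state, and an image feature fused in a multimodal layer.

# Minimal sketch (assumed sizes and names) of one m-RNN decoding step.
import numpy as np

rng = np.random.default_rng(0)
V, d_emb, d_rec, d_img, d_mm = 1000, 64, 128, 256, 128      # vocabulary and layer sizes (assumed)

W_emb = rng.normal(scale=0.1, size=(V, d_emb))               # word embeddings
W_rec = rng.normal(scale=0.1, size=(d_emb + d_rec, d_rec))   # recurrent layer
W_m_w = rng.normal(scale=0.1, size=(d_emb, d_mm))            # word embedding -> multimodal layer
W_m_r = rng.normal(scale=0.1, size=(d_rec, d_mm))            # recurrent state -> multimodal layer
W_m_i = rng.normal(scale=0.1, size=(d_img, d_mm))            # CNN image feature -> multimodal layer
W_out = rng.normal(scale=0.1, size=(d_mm, V))                # multimodal layer -> vocabulary

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

def mrnn_step(prev_word_id, h_prev, img_feat):
    """One decoding step: P(next word | previous words, image) and the new recurrent state."""
    e = W_emb[prev_word_id]
    h = np.tanh(np.concatenate([e, h_prev]) @ W_rec)
    m = np.tanh(e @ W_m_w + h @ W_m_r + img_feat @ W_m_i)    # multimodal fusion
    return softmax(m @ W_out), h

p, h = mrnn_step(prev_word_id=3, h_prev=np.zeros(d_rec), img_feat=rng.normal(size=d_img))
print(p.shape, p.sum())                                       # (1000,) and approximately 1.0
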

}, author = {Junhua Mao and Wei Xu and Yi Yang and Jiang Wang and Zhiheng Huang and Alan Yuille} } @article {1371, title = {Deep Convolutional Networks are Hierarchical Kernel Machines}, number = {035}, year = {2015}, month = {06/17/2015}, abstract = {

We extend i-theory to incorporate not only pooling but also rectifying nonlinearities in an extended HW module (eHW) designed for supervised learning. The two operations roughly correspond to invariance and selectivity, respectively. Under the assumption of normalized inputs, we show that appropriate linear combinations of rectifying nonlinearities are equivalent to radial kernels. If pooling is present, an equivalent kernel also exists. Thus present-day DCNs (Deep Convolutional Networks) can be exactly equivalent to a hierarchy of kernel machines with pooling and non-pooling layers. Finally, we describe a conjecture for theoretically understanding hierarchies of such modules. A main consequence of the conjecture is that hierarchies of eHW modules minimize memory requirements while computing a selective and invariant representation.

}, author = {F. Anselmi and Lorenzo Rosasco and Cheston Tan and Tomaso Poggio} } @article {800, title = {Discovering hierarchical motion structure}, journal = {Vision Research}, volume = {Available online 26 March 2015}, year = {2015}, month = {03/2015}, abstract = {

Scenes filled with moving objects are often hierarchically organized: the motion of a migrating goose is nested within the flight pattern of its flock, the motion of a car is nested within the traffic pattern of other cars on the road, the motion of body parts is nested in the motion of the body. Humans perceive hierarchical structure even in stimuli with two or three moving dots. An influential theory of hierarchical motion perception holds that the visual system performs a "vector analysis" of moving objects, decomposing them into common and relative motions. However, this theory does not specify how to resolve ambiguity when a scene admits more than one vector analysis. We describe a Bayesian theory of vector analysis and show that it can account for classic results from dot motion experiments, as well as new experimental data. Our theory takes a step towards understanding how moving scenes are parsed into objects.
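
As a toy illustration, not the paper's Bayesian model, the sketch below decomposes a few made-up dot velocities into one candidate common-motion component (here simply the mean) plus relative motions; the paper's contribution concerns how to choose among the many such candidate decompositions a scene admits.

# Toy "vector analysis": common motion plus relative motion (velocities are made up).
import numpy as np

velocities = np.array([[2.0, 0.5],   # velocity of dot 1 (x, y)
                       [2.2, 0.4],   # velocity of dot 2
                       [1.8, 0.6]])  # velocity of dot 3

common = velocities.mean(axis=0)      # one candidate common-motion component
relative = velocities - common        # residual, dot-specific motion

print("common motion:", common)
print("relative motion:\n", relative)
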

}, doi = {doi:10.1016/j.visres.2015.03.004}, url = {http://www.sciencedirect.com/science/article/pii/S0042698915000814$\#$sthash.vpJfuWmr.dpuf}, author = {Samuel J Gershman and Joshua B. Tenenbaum and Frank Jaekel} } @article {1096, title = {Discovering Switching Autoregressive Dynamics in Neural Spike Train Recordings}, year = {2015}, publisher = {Computational and Systems Neuroscience (Cosyne) Abstracts}, address = {Salt Lake City, UT, USA}, abstract = {

Generalized linear models (GLM) are powerful tools for identifying dependence in spiking populations of neurons, both over time and within the population (Paninski,\ 2004). The GLM identifies these dependencies by modeling spiking patterns through a linear regression and an appropriately-selected link function and likelihood. This regression setup is appealing for its simplicity, the wide variety of available priors, the potential for interpretability, and its computational efficiency. However, the GLM suffers from at least three notable deficiencies. First, the model is linear up to the link function, which only allows a limited range of response maps from neural spiking histories. Second, the model{\textquoteright}s parameters are fixed over time, while neural responses may vary due to processes that are exogenous to the population. Third, the generalized linear model presupposes a characteristic time scale for all dynamics, when there may be multiple, varying time scales of neural activity in a given population. Here we seek to address these deficiencies via a switching variant of the generalized linear model. A switching system is one that evolves through a set of discrete states over time, with each state exhibiting its own low-level dynamics. For example, the latent state of a hidden Markov model (HMM) can be used to determine the parameters of an autoregressive (AR) process. These HMM-AR models can be used to identify common patterns of linear dependence that vary over time. Bayesian nonparametric versions of HMM-AR models extend these ideas to allow for an infinite number of such patterns to exist a priori, and semi-Markov variants allow the different states to have idiosyncratic duration distributions. Here we develop GLM variants of these switching AR processes and specialize them for neural spiking data. In particular, we exploit recent data augmentation schemes for negative binomial likelihood functions (Pillow and Scott,\ 2012) to make inference tractable in HDP-HSMM-AR models with count-based observations.
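
A minimal generative sketch of the switching idea, with all parameters assumed for illustration: a hidden Markov chain selects which autoregressive dynamics produce each observation, the basic structure behind the HMM-AR models discussed above (the GLM variants and Bayesian nonparametric extensions are not shown).

# Simulate a two-state HMM-AR(1) process with assumed parameters.
import numpy as np

rng = np.random.default_rng(1)
T = 300
trans = np.array([[0.98, 0.02],       # state transition matrix (assumed)
                  [0.03, 0.97]])
ar_coef = np.array([0.9, -0.5])        # AR(1) coefficient for each discrete state (assumed)
noise_sd = np.array([0.2, 1.0])        # observation noise for each state (assumed)

states = np.zeros(T, dtype=int)
x = np.zeros(T)
for t in range(1, T):
    states[t] = rng.choice(2, p=trans[states[t - 1]])        # discrete Markov switching
    x[t] = ar_coef[states[t]] * x[t - 1] + rng.normal(scale=noise_sd[states[t]])

print("fraction of time in state 1:", states.mean())
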

}, author = {Matthew J. Johnson and Scott W. Linderman and Sandeep R. Datta and Ryan Adams} } @conference {1142, title = {Discriminative Template Learning in Group-Convolutional Networks for Invariant Speech Representations}, booktitle = {INTERSPEECH-2015}, year = {2015}, month = {09/2015}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Dresden, Germany}, url = {http://www.isca-speech.org/archive/interspeech_2015/i15_3229.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {1429, title = {Do You See What I Mean? Visual Resolution of Linguistic Ambiguities}, booktitle = {Conference on Empirical Methods in Natural Language Processing, Lisbon, Portugal. }, year = {2015}, month = {09/2015}, author = {Yevgeni Berzak and Andrei Barbu and Daniel Harari and Boris Katz and Shimon Ullman} } @article {3396, title = {Dynamics of 3D view invariance in monkey inferotemporal cortex}, journal = {Journal of Neurophysiology}, volume = {11319212373232821}, year = {2015}, month = {Jan-04-2015}, pages = {2180 - 2194}, issn = {0022-3077}, doi = {10.1152/jn.00810.2014}, url = {http://www.physiology.org/doi/10.1152/jn.00810.2014http://www.physiology.org/doi/pdf/10.1152/jn.00810.2014}, author = {N. Apurva Ratan Murty and Arun, Sripati P.} } @article {1957, title = {Early Reasoning about Affiliation and Caregiving}, number = {ID: 325/PS-III: 39}, year = {2015}, month = {01/2015}, abstract = {

Considerable research has examined infants{\textquoteright} reasoning about and evaluations of social agents, but two questions remain unanswered: First, do infants organize observed social relations into larger structures, inferring the relationship between two social beings based on their relations to a third party? Second, how do infants reason about a type of social relations prominent in all societies: kinship relations that modulate caregiving? In a series of experiments using animated events, we ask whether 9-, 11-, and 15- to 18-month-old infants expect two babies who were comforted by the same caregiver, or two caregivers who comforted the same baby, to affiliate with one another. We find that older infants make these inferences in a caregiving context, but not in a different context involving social interactions among adults. Thus, infants are sensitive to at least one aspect of kinship relations {\textemdash} caregiving {\textemdash} and organize these relations into larger social structures.

}, author = {A C Spokes and Elizabeth S Spelke} } @article {1344, title = {Early reasoning about affiliation and kinship.}, year = {2015}, month = {03/2015}, publisher = {Poster presentation at the Biennial Meeting of the Society for Research in Child Development, Philadelphia, PA}, author = {A C Spokes and Elizabeth S Spelke} } @conference {1045, title = {Efficient and robust analysis-by-synthesis in vision: A computational framework, behavioral tests, and modeling neuronal representations}, booktitle = {Annual Conference of the Cognitive Science Society}, year = {2015}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} } @article {358, title = {Expansion microscopy}, journal = {Science}, volume = {347 }, year = {2015}, month = {01/30/2015}, pages = {543-548 }, abstract = {

In optical microscopy, fine structural details are resolved by using refraction to magnify images of a specimen. We discovered that, by synthesizing a swellable polymer network within a specimen, it can be physically expanded, resulting in physical magnification. By covalently anchoring specific labels located within the specimen directly to the polymer network, labels spaced closer than the optical diffraction limit can be isotropically separated and optically resolved, a process we call expansion microscopy (ExM). Thus, this process can be used to perform scalable super-resolution microscopy with diffraction-limited microscopes. We demonstrate ExM with apparent ~70 nm lateral resolution in both cultured cells and brain tissue, performing three-color super-resolution imaging of ~10^7 μm^3 of the mouse hippocampus with a conventional confocal microscope.

}, issn = {0036-8075}, doi = {10.1126/science.1260088}, url = {http://www.sciencemag.org/cgi/doi/10.1126/science.1260088}, author = {Fei Chen and Paul W. Tillberg and Edward S Boyden} } @article {1726, title = {Face Patch Resting State Networks Link Face Processing to Social Cognition.}, journal = {PLoS Biology}, volume = {13}, year = {2015}, month = {09/2015}, pages = {e1002245}, abstract = {

Faces transmit a wealth of social information. How this information is exchanged between face-processing centers and brain areas supporting social cognition remains largely unclear. Here we identify these routes using resting state functional magnetic resonance imaging in macaque monkeys. We find that face areas functionally connect to specific regions within frontal, temporal, and parietal cortices, as well as subcortical structures supporting emotive, mnemonic, and cognitive functions. This establishes the existence of an extended face-recognition system in the macaque. Furthermore, the face patch resting state networks and the default mode network in monkeys show a pattern of overlap akin to that between the social brain and the default mode network in humans: this overlap specifically includes the posterior superior temporal sulcus, medial parietal, and dorsomedial prefrontal cortex, areas supporting high-level social cognition in humans. Together, these results reveal the embedding of face areas into larger brain networks and suggest that the resting state networks of the face patch system offer a new, easily accessible venue into the functional organization of the social brain and into the evolution of possibly uniquely human social skills.

}, keywords = {Face recognition, neural networks, prefrontal cortex, social cognition}, issn = {1545-7885}, doi = {10.1371/journal.pbio.1002245}, url = {http://journals.plos.org/plosbiology/article?id=10.1371/journal.pbio.1002245}, author = {Schwiedrzik, Caspar M and Wilbert Zarco and Everling, Stefan and W. A. Freiwald} } @proceedings {1204, title = {A fine-grained understanding of emotions: Young children match within-valence emotional expressions to their causes}, year = {2015}, month = {07/2015}, pages = {2685-2690}, author = {Yang Wu and Laura Schulz} } @article {1670, title = {From spatial symbols to Euclidean intuitions.}, year = {2015}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {1955, title = {From spatial symbols to Euclidean intuitions}, number = {ID: 316 / PS - I: 50}, year = {2015}, month = {10/2015}, address = {Columbus, OH}, abstract = {

Euclidean geometry is highly intuitive to adults from diverse cultures, but the sources of these intuitions remain unknown. The present study investigates whether children{\textquoteright}s understanding of Euclidean geometry is linked to their use of spatial symbols. Six-, 10-, and 12-year-old children were given tests of navigation by purely geometric maps, which required them to place objects in fragmented 3D environments using 2D maps highlighting the same or different geometric information as the 3D environments. Children also completed a test of abstract geometric reasoning focused on triangle completion. Performance on the geometric reasoning test improved markedly with age, and this improvement was associated with more integrated interpretations of the geometric maps and environments. These findings connect the achievement of Euclidean intuitions to the mastery of spatial symbols.

}, url = {http://cogdevsoc.org/sites/default/files/Official\%20Full\%20Conference\%20Proceedings_10.10.pdf}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {1751, title = {Functional organization of social perception and cognition in the superior temporal sulcus}, journal = {Cerebral Cortex}, volume = {25}, year = {2015}, month = {11/2015}, pages = {4596-4609}, abstract = {

The superior temporal sulcus (STS) is considered a hub for social perception and cognition, including the perception of faces and human motion, as well as understanding others{\textquoteright} actions, mental states, and language. However, the functional organization of the STS remains debated: Is this broad region composed of multiple functionally distinct modules, each specialized for a different process, or are STS subregions multifunctional, contributing to multiple processes? Is the STS spatially organized, and if so, what are the dominant features of this organization? We address these questions by measuring STS responses to a range of social and linguistic stimuli in the same set of human participants, using fMRI. We find a number of STS subregions that respond selectively to certain types of social input, organized along a posterior-to-anterior axis. We also identify regions of overlapping response to multiple contrasts, including regions responsive to both language and theory of mind, faces and voices, and faces and biological motion. Thus, the human STS contains both relatively domain-specific areas, and regions that respond to multiple types of social information.

}, doi = { 10.1093/cercor/bhv111}, url = {http://cercor.oxfordjournals.org/content/25/11/4596.full}, author = {Ben Deen and Kami Koldewyn and Nancy Kanwisher and Rebecca Saxe} } @article {1749, title = {Functional organization of the human superior temporal sulcus}, number = { Poster Number: 4380 }, year = {2015}, month = {6/2015}, address = {Honolulu, HI}, abstract = {

The human superior temporal sulcus (STS) has been implicated in a broad range of social perceptual and cognitive processes, including the perception of faces, biological motion, and vocal sounds, and the understanding of language and mental states. However, little is known about the overall functional organization of these responses. Does the STS contain distinct, specialized regions for processing different types of social information? Or is cortex in the STS largely multifunctional, with each region engaged in multiple different computations (Hein, 2008)? Because prior work has largely studied these processes independently, this question remains unanswered. Here, we first identify distinct functional subregions of the STS, and then examine their response to a broad range of social stimuli.

}, url = {https://ww4.aievolution.com/hbm1501/index.cfm?do=abs.viewAbs\&abs=3635}, author = {Ben Deen and Nancy Kanwisher and Rebecca Saxe} } @conference {1825, title = {Galileo: Perceiving physical object properties by integrating a physics engine with deep learning.}, booktitle = {NIPS 2015}, year = {2015}, address = { Montr{\'e}al, Canada}, abstract = {
Humans demonstrate remarkable abilities to predict physical events in dynamic scenes, and to infer the physical properties of objects from static images. We propose a generative model for solving these problems of physical scene understanding from real-world videos and images. At the core of our generative model is a 3D physics engine, operating on an object-based representation of physical properties, including mass, position, 3D shape, and friction. We can infer these latent properties using relatively brief runs of MCMC, which drive simulations in the physics engine to fit key features of visual observations. We further explore directly mapping visual inputs to physical properties, inverting a part of the generative process using deep learning. We name our model Galileo, and evaluate it on a video dataset with simple yet physically rich scenarios. Results show that Galileo is able to infer the physical properties of objects and predict the outcome of a variety of physical events, with an accuracy comparable to human subjects. Our study points towards an account of human vision with generative physical knowledge at its core, and various recognition models as helpers leading to efficient inference.
}, url = {https://papers.nips.cc/paper/5780-galileo-perceiving-physical-object-properties-by-integrating-a-physics-engine-with-deep-learning}, author = {Jiajun Wu and Ilker Yildirim and Joseph J. Lim and William T. Freeman and Joshua B. Tenenbaum} } @proceedings {793, title = {Graph Approximation and Clustering on a Budget}, volume = {38}, year = {2015}, abstract = {

We consider the problem of learning from a similarity matrix (such as spectral clustering and low-dimensional embedding), when computing pairwise similarities is costly, and only a limited number of entries can be observed. We provide a theoretical analysis using standard notions of graph approximation, significantly generalizing previous results, which focused on spectral clustering with two clusters. We also propose a new algorithmic approach based on adaptive sampling, which experimentally matches or improves on previous methods, while being considerably more general and computationally cheaper.

}, author = {Ethan Fetaya and Ohad Shamir and Shimon Ullman} } @article {1508, title = {Holographic Embeddings of Knowledge Graphs}, year = {2015}, month = {11/16/2015}, abstract = {

Learning embeddings of entities and relations is an efficient and versatile method to perform machine learning on relational data such as knowledge graphs. In this work, we propose holographic embeddings (HolE) to learn compositional vector space representations of entire knowledge graphs. The proposed method is related to holographic models of associative memory in that it employs circular correlation to create compositional representations. By using correlation as the compositional operator, HolE can capture rich interactions but simultaneously remains efficient to compute, easy to train, and scalable to very large datasets. In extensive experiments we show that holographic embeddings are able to outperform state-of-the-art methods for link prediction in knowledge graphs and relational learning benchmark datasets.
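
A minimal numpy sketch of the circular-correlation composition described above; the embedding dimension, random vectors, and sigmoid scoring below are illustrative assumptions rather than the paper's trained model.

# Circular correlation computed via the FFT, and a HolE-style triple score.
import numpy as np

def circular_correlation(a, b):
    """[a * b]_k = sum_i a_i * b_{(i + k) mod d}, computed efficiently in the Fourier domain."""
    return np.real(np.fft.ifft(np.conj(np.fft.fft(a)) * np.fft.fft(b)))

def hole_score(subject_emb, object_emb, relation_emb):
    """Plausibility of a triple: sigmoid of relation . (subject circularly correlated with object)."""
    return 1.0 / (1.0 + np.exp(-relation_emb @ circular_correlation(subject_emb, object_emb)))

rng = np.random.default_rng(2)
d = 32                                   # embedding dimension (assumed)
s, o, r = rng.normal(size=d), rng.normal(size=d), rng.normal(size=d)
print(hole_score(s, o, r))
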

}, keywords = {Associative Memory, Knowledge Graph, Machine Learning}, author = {Maximilian Nickel and Lorenzo Rosasco and Tomaso Poggio} } @article {1593, title = {How Important is Weight Symmetry in Backpropagation?}, year = {2015}, month = {11/29/2015}, abstract = {

Gradient backpropagation (BP) requires symmetric feedforward and feedback connections{\textemdash}the same weights must be used for forward and backward passes. This {\textquotedblleft}weight transport problem{\textquotedblright} [1] is thought to be one of the main reasons for BP{\textquoteright}s biological implausibility. Using 15 different classification datasets, we systematically study to what extent BP really depends on weight symmetry. In a study that turned out to be surprisingly similar in spirit to Lillicrap et al.{\textquoteright}s demonstration [2] but orthogonal in its results, our experiments indicate that: (1) the magnitudes of feedback weights do not matter to performance; (2) the signs of feedback weights do matter{\textemdash}the more concordant the signs between feedforward and their corresponding feedback connections, the better; (3) with feedback weights having random magnitudes and 100\% concordant signs, we were able to achieve the same or even better performance than SGD; and (4) some normalizations/stabilizations are indispensable for such asymmetric BP to work, namely Batch Normalization (BN) [3] and/or a {\textquotedblleft}Batch Manhattan{\textquotedblright} (BM) update rule.
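
A toy illustration, not the study's experimental setup, of sign-concordant asymmetric feedback: the backward pass routes errors through feedback weights that share only the signs of the forward weights, with random magnitudes, as in finding (3) above. Network sizes, data, and the learning rate are assumed.

# One-hidden-layer network trained with sign-concordant (asymmetric) feedback weights.
import numpy as np

rng = np.random.default_rng(3)
n_in, n_hid, n_out = 20, 50, 2
W1 = rng.normal(scale=0.1, size=(n_in, n_hid))
W2 = rng.normal(scale=0.1, size=(n_hid, n_out))
B = np.sign(W2) * np.abs(rng.normal(scale=0.1, size=W2.shape))   # signs of W2, random magnitudes

X = rng.normal(size=(200, n_in))
y = (X[:, 0] > 0).astype(int)                                    # toy binary labels
Y = np.eye(n_out)[y]                                             # one-hot targets

lr = 0.1
for _ in range(200):
    h = np.maximum(0, X @ W1)                                    # ReLU hidden layer
    logits = h @ W2
    p = np.exp(logits - logits.max(axis=1, keepdims=True))
    p /= p.sum(axis=1, keepdims=True)
    err = (p - Y) / len(X)                                       # softmax cross-entropy gradient
    dW2 = h.T @ err
    dh = (err @ B.T) * (h > 0)                                   # error routed through B, not W2.T
    dW1 = X.T @ dh
    W1 -= lr * dW1
    W2 -= lr * dW2

print("training accuracy:", (p.argmax(axis=1) == y).mean())
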

}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {1773, title = {How PFC and LIP process single and multiple-object {\textquoteleft}pop-out{\textquoteright} displays}, year = {2015}, abstract = {

Images in which one object is more salient than its surroundings lead to a {\textquoteleft}pop-out{\textquoteright} effect, whereby subjects show very efficient behavioral responses to the salient object. This pop-out effect is present for displays in which: 1) a single object is on a blank background, and 2) a single object is highly distinct from other surrounding objects. Thus it is generally assumed that this pop-out effect arises from the same neural computations for both of these types of displays, and it is thought that this effect is mediated by {\textquotedblleft}bottom-up{\textquotedblright} attentional mechanisms.

To directly examine whether these two types of displays are indeed processed the same way, we recorded neural activity in LIP and PFC, two brain regions implicated in attentional processing. Using population decoding methods applied to a population of 280 LIP and PFC neurons recorded from two monkeys, we observed that when a single isolated object is displayed, information about the object{\textquoteright}s location appeared ~10 ms earlier in LIP than in PFC, which is consistent with a feed-forward account for processing isolated objects. However, when a salient object is presented among multiple distractor objects, information about the location of the salient object was delayed by 60-90 ms in both brain regions, and information now first appeared in PFC. Despite the differences in the latency of information between the two display types, the latency of population firing rate activity was similar for both types of displays. Additionally, we see that the pattern of neural activity is very similar for both types of displays (and across different color transformations of the stimuli), indicating that information about the object{\textquoteright}s location is being coded in the same way regardless of display type. These results indicate that there is a {\textquoteleft}top-down{\textquoteright} neural component for processing pop-out displays, and that firing rate latencies can be quite distinct from the latency of when information first appears in a brain region.

}, url = {https://www.sfn.org/~/media/SfN/Documents/Annual\%20Meeting/FinalProgram/NS2015/Full\%20Abstract\%20PDFs\%202015/SfN15_Abstracts_PDF_Nanos.ashx}, author = {Ethan Meyers}, editor = {Andy Liang and Christos Constantinidis} } @proceedings {755, title = {How, whether, why: Causal judgments as counterfactual contrasts}, year = {2015}, month = {07/22/2015}, pages = {782-787}, address = {Pasadena, CA}, issn = {978-0-9911967-2-2}, url = {https://mindmodeling.org/cogsci2015/papers/0142/index.html}, author = {Tobias Gerstenberg and Noah D. Goodman and D. A. Lagnado and Joshua B. Tenenbaum} } @article {1566, title = {Human-level concept learning through probabilistic program induction}, journal = {Science}, volume = {350}, year = {2015}, month = {12/11/2015}, pages = {1332-1338 }, abstract = {

People learning new concepts can often generalize successfully from just a single example, yet machine learning algorithms typically require tens or hundreds of examples to perform with similar accuracy. People can also use learned concepts in richer ways than conventional algorithms{\textemdash}for action, imagination, and explanation. We present a computational model that captures these human learning abilities for a large class of simple visual concepts: handwritten characters from the world{\textquoteright}s alphabets. The model represents concepts as simple programs that best explain observed examples under a Bayesian criterion. On a challenging one-shot classification task, the model achieves human-level performance while outperforming recent deep learning approaches. We also present several {\textquotedblleft}visual Turing tests{\textquotedblright} probing the model{\textquoteright}s creative generalization abilities, which in many cases are indistinguishable from human behavior.

}, keywords = {Machine Learning}, doi = {10.1126/science.aab3050 }, url = {http://www.sciencemag.org/content/350/6266/1332.short}, author = {Brenden M Lake and Salakhutdinov, Ruslan and Joshua B. Tenenbaum} } @proceedings {1205, title = {Hypothesis-Space Constraints in Causal Learning}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, url = {https://mindmodeling.org/cogsci2015/papers/0418/index.html}, author = {Pedro Tsividis and Joshua B. Tenenbaum and Laura Schulz} } @article {1195, title = {Imagination and the generation of new ideas}, journal = {Cognitive Development}, volume = {34}, year = {2015}, month = {April{\textendash}June 2015}, pages = {99{\textendash}110}, doi = {10.1016/j.cogdev.2014.12.008}, url = {http://www.sciencedirect.com/science/article/pii/S0885201414000744}, author = {Rachel Magid}, editor = {Mark Sheskin} } @article {1887, title = {The Infancy of the Human Brain}, journal = {Neuron}, volume = {88}, year = {2015}, month = {Jan-10-2015}, pages = {93 - 109}, issn = {08966273}, doi = {10.1016/j.neuron.2015.09.026}, url = {http://linkinghub.elsevier.com/retrieve/pii/S0896627315008156}, author = {Dehaene-Lambertz, G. and Elizabeth S Spelke} } @article {1956, title = {Infants{\textquoteright} Categorization of Social Actions}, number = {ID: 476 / PS - II: 48}, year = {2015}, month = {10/2015}, address = {Columbus, OH}, abstract = {

Infants use information about efficiency to identify agents{\textquoteright} physical goals. But how do they recognize actions with social rather than physical functions? They may rely on an understanding that socially meaningful actions work not by efficiently enacting physical changes, but instead through shared use across group members. We found support for this hypothesis across several experiments that probed the conditions under which 8- and 9-month-old infants expect an action to be performed by additional members of the initial actor{\textquoteright}s social group. Infants generalized actions to new members of social groups if and only if the actions in question were non-instrumental and infants had observed two socially related individuals repeating the action, whether or not they were members of the group across which generalization was tested. Thus, infants use characteristics of social behavior {\textendash} physical inefficiency and shared use by group members {\textendash} to categorize actions as social.

}, author = {Lindsey J Powell and Elizabeth S Spelke} } @article {1819, title = {Infants{\textquoteright} Reasoning about Affiliation and Caregiving}, year = {2015}, month = {10/2015}, address = {Columbus, Ohio}, author = {A C Spokes}, editor = {Elizabeth S Spelke} } @article {1817, title = {Infants{\textquoteright} Reasoning about Affiliation and Caregiving}, year = {2015}, month = {10/2015}, author = {A C Spokes}, editor = {Elizabeth S Spelke} } @article {1671, title = {Infants{\textquoteright} sensitivity to shape changes.}, year = {2015}, author = {Moira R Dillon and Izard, V. and Elizabeth S Spelke} } @article {1095, title = {Inferring structured connectivity from spike trains under negative-binomial generalized linear models}, year = {2015}, publisher = {Computational and Systems Neuroscience (Cosyne) Abstracts}, address = {Salt Lake City, UT, USA}, abstract = {

The steady expansion of neural recording capability provides exciting opportunities for discovering unexpected patterns and gaining new insights into neural computation. Realizing these gains requires flexible and accurate yet tractable statistical methods for extracting structure from large-scale neural recordings. Here we present a model for simultaneously recorded multi-neuron spike trains with negative binomial spiking and structured patterns of functional coupling between neurons. We use a generalized linear model (GLM) with negative-binomial observations to describe spike trains, which provides a flexible model for over-dispersed spike counts (i.e., responses with greater-than-Poisson variability), and introduce flexible priors over functional coupling kernels derived from sparse random network models. The coupling kernels capture dependencies between neurons by allowing spiking activity in each neuron to influence future spiking activity in its neighbors. However, these dependencies tend to be sparse, and to have additional structure that is not exploited by standard (e.g., group lasso) regularization methods. For example, neurons may belong to different classes, as is often found in the retina, or they may be characterized by a small number of features, such as a preferred stimulus selectivity. These latent variables lend interpretability to otherwise incomprehensible data. To incorporate these concepts, we decompose the coupling kernels with a weighted network, and leverage latent variable models like the Erd{\H{o}}s-R{\'e}nyi model, stochastic block model, and the latent feature model as priors over the interactions. To perform inference, we exploit recent innovations in negative binomial regression to perform efficient, fully-Bayesian sampling of the posterior distribution over parameters given the data. This provides access to the full posterior distribution over connectivity, and allows underlying network variables to be inferred alongside the low-dimensional latent variables of each neuron. We apply the model to neural data from primate retina and show that it recovers interpretable patterns of interaction between different cell types.
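
A minimal generative sketch, with assumed parameters rather than the paper's inference code, of negative-binomial spike counts whose log rates depend on the previous time bin's spikes through a sparse coupling matrix.

# Simulate over-dispersed spike counts from a coupled negative-binomial GLM (assumed parameters).
import numpy as np

rng = np.random.default_rng(4)
N, T, r = 8, 500, 4.0                               # neurons, time bins, NB dispersion (assumed)
baseline = rng.normal(-0.5, 0.3, size=N)            # per-neuron log baseline rate
coupling = rng.normal(0, 0.1, size=(N, N)) * (rng.random((N, N)) < 0.2)   # sparse coupling

spikes = np.zeros((T, N), dtype=int)
for t in range(1, T):
    log_rate = baseline + coupling @ spikes[t - 1]  # GLM: log mean rate is linear in recent spikes
    mu = np.exp(np.clip(log_rate, -10, 5))          # clip for numerical safety
    p = r / (r + mu)                                # negative-binomial parameterization with mean mu
    spikes[t] = rng.negative_binomial(r, p)

print("mean counts per neuron:", spikes.mean(axis=0).round(2))
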

}, author = {Scott W. Linderman and Ryan Adams and Jonathan Pillow} } @article {1811, title = {Information Selection in Noisy Environments with Large Action Spaces}, volume = {Columbus, OH}, year = {2015}, author = {Pedro Tsividis and Samuel J Gershman and Joshua B. Tenenbaum and Laura Schulz} } @article {1356, title = {Intelligent Information Loss: The Coding of Facial Identity, Head Pose, and Non-Face Information in the Macaque Face Patch System}, journal = {The Journal of Neuroscience }, volume = {35}, year = {2015}, month = {05/2015}, chapter = {7069}, abstract = {

Faces are a behaviorally important class of visual stimuli for primates. Recent work in macaque monkeys has identified six discrete face areas where most neurons have higher firing rates to images of faces compared with other objects (Tsao et al., 2006). While neurons in these areas appear to have different tuning (Freiwald and Tsao, 2010; Issa and DiCarlo, 2012), exactly what types of information and, consequently, which visual behaviors neural populations within each face area can support, is unknown. Here we use population decoding to better characterize three of these face patches (ML/MF, AL, and AM). We show that neural activity in all patches contains information that discriminates between the broad categories of face and nonface objects, individual faces, and nonface stimuli. Information is present in both high and lower firing rate regimes. However, there were significant differences between the patches, with the most anterior patch showing relatively weaker representation of nonface stimuli. Additionally, we find that pose-invariant face identity information increases as one moves to more anterior patches, while information about the orientation of the head decreases. Finally, we show that all the information we can extract from the population is present in patterns of activity across neurons, and there is relatively little information in the total activity of the population. These findings give new insight into the representations constructed by the face patch system and how they are successively transformed.
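
A minimal sketch of the population-decoding logic on simulated data, not the recorded neural data: pseudo-population firing-rate vectors are classified with a simple nearest-centroid decoder on held-out trials. All rates, sizes, and the decoder choice are assumptions.

# Nearest-centroid decoding of a stimulus category from simulated population responses.
import numpy as np

rng = np.random.default_rng(5)
n_neurons, n_trials = 60, 40
means = rng.uniform(1.0, 10.0, size=(2, n_neurons))          # tuning for 2 categories (e.g., face vs. non-face)
rates = np.concatenate([rng.poisson(means[c], size=(n_trials, n_neurons)) for c in (0, 1)])
labels = np.repeat([0, 1], n_trials)

train = np.arange(2 * n_trials) % 2 == 0                      # split trials into train/test halves
centroids = np.stack([rates[train & (labels == c)].mean(axis=0) for c in (0, 1)])
dists = np.linalg.norm(rates[~train, None, :] - centroids[None, :, :], axis=2)
pred = dists.argmin(axis=1)

print("decoding accuracy:", (pred == labels[~train]).mean())
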

}, doi = {10.1523/JNEUROSCI.3086-14.2015}, url = {http://www.ncbi.nlm.nih.gov/pubmed/25948258}, author = {Ethan Meyers and Mia Borzello and W. A. Freiwald and Doris Tsao} } @article {695, title = {On Invariance and Selectivity in Representation Learning}, number = {029}, year = {2015}, month = {03/23/2015}, abstract = {

We discuss data representations which can be learned automatically from data, are invariant to transformations, and are at the same time selective, in the sense that two points have the same representation only if one is a transformation of the other. The mathematical results here sharpen some of the key claims of i-theory, a recent theory of feedforward processing in sensory cortex.

}, author = {F. Anselmi and Lorenzo Rosasco and Tomaso Poggio} } @article {1380, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, year = {2015}, month = {07/2015}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @article {1484, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, journal = {PLOS Computational Biology}, volume = {11}, year = {2015}, month = {10/23/2015}, pages = {e1004390}, abstract = {

Is visual cortex made up of general-purpose information processing machinery, or does it consist of a collection of specialized modules? If prior knowledge, acquired from learning a set of objects, is only transferable to new objects that share properties with the old, then the recognition system{\textquoteright}s optimal organization must be one containing specialized modules for different object classes. Our analysis starts from a premise we call the invariance hypothesis: that the computational goal of the ventral stream is to compute an invariant-to-transformations and discriminative signature for recognition. The key condition enabling approximate transfer of invariance without sacrificing discriminability turns out to be that the learned and novel objects transform similarly. This implies that the optimal recognition system must contain subsystems trained only with data from similarly-transforming objects and suggests a novel interpretation of domain-specific regions like the fusiform face area (FFA). Furthermore, we can define an index of transformation-compatibility, computable from videos, that can be combined with information about the statistics of natural vision to yield predictions for which object categories ought to have domain-specific regions in agreement with the available data. The result is a unifying account linking the large literature on view-based recognition with the wealth of experimental evidence concerning domain-specific regions.

}, doi = {10.1371/journal.pcbi.1004390}, url = {http://dx.plos.org/10.1371/journal.pcbi.1004390}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @article {2559, title = {Invariant representations for action recognition in the visual system.}, volume = {15}, year = {2015}, address = {Journal of vision}, doi = {10.1167/15.12.558}, url = {http://jov.arvojournals.org/article.aspx?articleid=2433666}, author = {Andrea Tacchetti and Leyla Isik and Tomaso Poggio} } @article {2560, title = {Invariant representations for action recognition in the visual system}, year = {2015}, author = {Leyla Isik and Andrea Tacchetti and Tomaso Poggio} } @article {867, title = {Isolating angle in infants{\textquoteright} detection of shape}, year = {2015}, author = {Moira R Dillon and V{\'e}ronique Izard and Elizabeth S Spelke} } @article {1588, title = {I-theory on depth vs width: hierarchical function composition}, year = {2015}, month = {12/29/2015}, abstract = {

Deep learning networks with convolution, pooling and subsampling are a special case of hierarchical architectures, which can be represented by trees (such as binary trees). Hierarchical as well as shallow networks can approximate functions of several variables, in particular those that are compositions of low dimensional functions. We show that the power of a deep network architecture with respect to a shallow network is rather independent of the specific nonlinear operations in the network and depends instead on the behavior of the VC-dimension. A shallow network can approximate compositional functions with the same error as a deep network, but at the cost of a VC-dimension that is exponential rather than quadratic in the dimensionality of the function. To complete the argument, we argue that there exist visual computations that are intrinsically compositional. In particular, we prove that recognition invariant to translation cannot be computed by shallow networks in the presence of clutter. Finally, a general framework that includes the compositional case is sketched. The key condition that allows tall, thin networks to be nicer than short, fat networks is that the target input-output function must be sparse in a certain technical sense.
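
An illustrative example, not taken from the paper, of the kind of compositional, binary-tree structure at issue: each constituent function depends on only two variables, so a deep network can mirror the hierarchy, whereas a shallow network must treat f as a generic function of all eight inputs.

\[
f(x_1,\dots,x_8) \;=\; h_3\bigl(h_{21}\bigl(h_{11}(x_1,x_2),\,h_{12}(x_3,x_4)\bigr),\; h_{22}\bigl(h_{13}(x_5,x_6),\,h_{14}(x_7,x_8)\bigr)\bigr).
\]
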

}, author = {Tomaso Poggio and F. Anselmi and Lorenzo Rosasco} } @conference {1982, title = {Learning like a Child: Fast Novel Visual Concept Learning from Sentence Descriptions of Images}, booktitle = {International Conference of Computer Vision}, year = {2015}, month = {12/2015}, address = {Santiago, Chile}, abstract = {
In this paper, we address the task of learning novel visual concepts, and their interactions with other concepts, from a few images with sentence descriptions.
Using linguistic context and visual features, our method is able to efficiently hypothesize the semantic meaning of new words and add them to its word dictionary so that they can be used to describe images which contain these novel concepts.
Our method has an image captioning module based on m-RNN with several improvements.
In particular, we propose a transposed weight sharing scheme, which not only improves performance on image captioning, but also makes the model more suitable for the novel concept learning task.
We propose methods to prevent overfitting the new concepts. 
In addition, three novel concept datasets are constructed for this new task, and are publicly available on the project page.
In the experiments, we show that our method effectively learns novel visual concepts from a few examples without disturbing the previously learned concepts.
The project page is: \url{www.stat.ucla.edu/~junhua.mao/projects/child_learning.html}.
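
A minimal sketch of the transposed weight sharing idea as we read it: the layer that decodes hidden states into word scores reuses the word-embedding matrix instead of owning a separate vocabulary-sized weight matrix, so learning a novel concept word only adds one embedding row. Dimensions and the toy forward pass are our assumptions; the actual m-RNN model has additional layers.

import numpy as np

rng = np.random.default_rng(0)
vocab_size, emb_dim = 1000, 128
U = 0.01 * rng.standard_normal((vocab_size, emb_dim))   # word-embedding matrix

def embed(word_id):
    return U[word_id]                  # encoding: a row of U

def word_logits(hidden):
    return U @ hidden                  # decoding: the same U, used in transposed fashion

h = rng.standard_normal(emb_dim)       # e.g., a recurrent or multimodal hidden state
p = np.exp(word_logits(h)); p /= p.sum()
print(p.shape)                         # (1000,): one probability per word

# adding a novel concept word appends a single embedding row and leaves the rest untouched
U = np.vstack([U, 0.01 * rng.standard_normal(emb_dim)])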
}, url = {www.stat.ucla.edu/~junhua.mao/projects/child_learning.html}, author = {Junhua Mao and Wei Xu and Yi Yang and Jiang Wang and Zhiheng Huang and Alan Yuille} } @conference {1574, title = {Learning with a Wasserstein Loss}, booktitle = {Advances in Neural Information Processing Systems (NIPS 2015) 28}, year = {2015}, abstract = {

Learning to predict multi-label outputs is challenging, but in many problems there is a natural metric on the outputs that can be used to improve predictions. In this paper we develop a loss function for multi-label learning, based on the Wasserstein distance. The Wasserstein distance provides a natural notion of dissimilarity for probability measures. Although optimizing with respect to the exact Wasserstein distance is costly, recent work has described a regularized approximation that is efficiently computed. We describe an efficient learning algorithm based on this regularization, as well as a novel extension of the Wasserstein distance from probability measures to unnormalized measures. We also describe a statistical learning bound for the loss. The Wasserstein loss can encourage smoothness of the predictions with respect to a chosen metric on the output space. We demonstrate this property on a real-data tag prediction problem, using the Yahoo Flickr Creative Commons dataset, outperforming a baseline that doesn{\textquoteright}t use the metric.
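
A minimal sketch, under our own simplifying assumptions, of the regularized transport cost that such a loss builds on: Sinkhorn-style fixed-point iterations for an entropically regularized optimal transport problem between two label histograms. The paper's loss additionally covers unnormalized measures and the learning bound.

import numpy as np

def sinkhorn_cost(p, q, M, reg=0.1, n_iter=200):
    # p, q: histograms over labels; M: ground metric between labels
    K = np.exp(-M / reg)                   # Gibbs kernel
    u = np.ones_like(p)
    for _ in range(n_iter):
        v = q / (K.T @ u)
        u = p / (K @ v)
    T = np.diag(u) @ K @ np.diag(v)        # approximate optimal transport plan
    return np.sum(T * M)                   # regularized transport cost

# labels arranged on a line, so mistakes between distant labels cost more
M = np.abs(np.subtract.outer(np.arange(3), np.arange(3))).astype(float)
prediction = np.array([0.7, 0.2, 0.1])
target     = np.array([0.0, 0.0, 1.0])
print(sinkhorn_cost(prediction, target, M))   # high cost: predicted mass sits far from the target label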

}, url = {http://arxiv.org/abs/1506.05439}, author = {Charlie Frogner and Chiyuan Zhang and Hossein Mobahi and Mauricio Araya-Polo and Tomaso Poggio} } @conference {1559, title = {Learning with Group Invariant Features: A Kernel Perspective}, booktitle = {NIPS 2015}, year = {2015}, abstract = {
In this paper we analyze a random feature map based on a theory of invariance (I-theory) introduced in Anselmi et al. (2013). More specifically, a group invariant signal signature is obtained through cumulative distributions of group-transformed random projections. Our analysis bridges invariant feature learning with kernel methods, as we show that this feature map defines an expected Haar-integration kernel that is invariant to the specified group action. We show how this non-linear random feature map approximates this group invariant kernel uniformly on a set of N points. Moreover, we show that it defines a function space that is dense in the equivalent Invariant Reproducing Kernel Hilbert Space. Finally, we quantify error rates of the convergence of the empirical risk minimization, as well as the reduction in the sample complexity of a learning algorithm using such an invariant representation for signal classification, in a classical supervised learning setting.
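
A hedged toy version of the construction: project a signal onto the full group orbit of a random template (here the group is cyclic shifts) and summarize the resulting projections by their empirical CDF evaluated at fixed thresholds. The choice of group, thresholds and normalization below are illustrative assumptions of ours.

import numpy as np

def invariant_signature(x, templates, thresholds):
    d = len(x)
    feats = []
    for t in templates:
        projs = np.array([x @ np.roll(t, s) for s in range(d)])        # orbit of projections
        feats.append([(projs <= thr).mean() for thr in thresholds])    # empirical CDF values
    return np.concatenate(feats)

rng = np.random.default_rng(0)
d = 32
templates = rng.standard_normal((5, d))
thresholds = np.linspace(-3.0, 3.0, 9)

x = rng.standard_normal(d)
print(np.allclose(invariant_signature(x, templates, thresholds),
                  invariant_signature(np.roll(x, 7), templates, thresholds)))   # True: invariant to shifts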
}, url = {https://papers.nips.cc/paper/5798-learning-with-group-invariant-features-a-kernel-perspective}, author = {Youssef Mroueh and Stephen Voinea and Tomaso Poggio} } @conference {1573, title = {Learning with incremental iterative regularization}, booktitle = {NIPS 2015}, year = {2015}, abstract = {

Within a statistical learning setting, we propose and study an iterative regularization algorithm for least squares defined by an incremental gradient method. In particular, we show that, if all other parameters are fixed a priori, the number of passes over the data (epochs) acts as a regularization parameter, and prove strong universal consistency, i.e. almost sure convergence of the risk, as well as sharp finite sample bounds for the iterates. Our results are a step towards understanding the effect of multiple epochs in stochastic gradient techniques in machine learning and rely on integrating statistical and optimization results.
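
A minimal numerical sketch of the phenomenon, with step size, data and noise level chosen by us purely for illustration: run incremental gradient passes for least squares and vary only the number of epochs, which then plays the role of the regularization parameter.

import numpy as np

def incremental_pass(X, y, epochs, lr=0.01):
    w = np.zeros(X.shape[1])
    for _ in range(epochs):
        for xi, yi in zip(X, y):           # one incremental sweep over the data
            w -= lr * (xi @ w - yi) * xi
    return w

rng = np.random.default_rng(0)
n, d = 50, 20
w_true = rng.standard_normal(d)
X = rng.standard_normal((n, d))
y = X @ w_true + 0.5 * rng.standard_normal(n)
X_test = rng.standard_normal((500, d))
y_test = X_test @ w_true

for epochs in (1, 10, 100, 1000):
    w = incremental_pass(X, y, epochs)
    print(epochs, round(float(np.mean((X_test @ w - y_test) ** 2)), 3))   # fewer epochs = stronger regularization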

}, url = {https://papers.nips.cc/paper/6015-learning-with-incremental-iterative-regularization}, author = {Lorenzo Rosasco and Villa, Silvia} } @conference {1572, title = {Less is More: Nystr{\"o}m Computational Regularization}, booktitle = {NIPS 2015}, year = {2015}, abstract = {

We study Nystr{\"o}m type subsampling approaches to large-scale kernel methods, and prove learning bounds in the statistical learning setting, where random sampling and high probability estimates are considered. In particular, we prove that these approaches can achieve optimal learning bounds, provided the subsampling level is suitably chosen. These results suggest a simple incremental variant of Nystr{\"o}m Kernel Regularized Least Squares, where the subsampling level implements a form of computational regularization, in the sense that it controls at the same time regularization and computations. Extensive experimental analysis shows that the considered approach achieves state-of-the-art performance on benchmark large-scale datasets.
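
A hedged sketch of a Nystr{\"o}m-subsampled kernel regularized least squares estimator; the kernel, jitter term and parameter values are illustrative choices of ours. Only m randomly chosen landmark points enter the linear solve, so m simultaneously controls computation and, as noted above, regularization.

import numpy as np

def rbf(A, B, gamma=1.0):
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

def nystrom_krls(X, y, m, lam=1e-3, gamma=1.0, seed=0):
    rng = np.random.default_rng(seed)
    landmarks = X[rng.choice(len(X), size=m, replace=False)]   # subsampled points
    Knm = rbf(X, landmarks, gamma)                             # n x m
    Kmm = rbf(landmarks, landmarks, gamma)                     # m x m
    a = np.linalg.solve(Knm.T @ Knm + lam * len(X) * Kmm + 1e-8 * np.eye(m), Knm.T @ y)
    return lambda Xnew: rbf(Xnew, landmarks, gamma) @ a

rng = np.random.default_rng(1)
X = rng.uniform(-3, 3, size=(500, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.standard_normal(500)
f = nystrom_krls(X, y, m=50)
print(float(np.mean((f(X) - y) ** 2)))   # fit obtained with only 50 of the 500 points in the solve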

}, url = {https://papers.nips.cc/paper/5936-less-is-more-nystrom-computational-regularization}, author = {Alessandro Rudi and Raffaello Camoriano and Lorenzo Rosasco} } @article {2061, title = {Lust and the Turing test [Nature] }, year = {2015}, month = {05/2015}, publisher = {Nature}, abstract = {

Dr. Christof Koch, Chief Scientific Officer, Allen Institute for Brain Science, wrote a recent blog post for Nature entitled "Lust and the Turing test."

"By and large, we watch movies to be entertained, not to be provoked into deep thought. Occasionally, a film does both. This year{\textquoteright}s Ex Machina is one such gem. It prompted me to reflect upon the evolution of the idea of machine sentience over the past three decades of science fiction on film.

I am a long-time student of the mind-body problem {\textemdash} how consciousness arises from the brain. There is a conundrum at the heart of this ancient dilemma, challenging both brain science and AI; and it is well captured by Ex Machina and two other SF movies. In essence, it lies in how we can ever be certain that a machine feels anything, is conscious."

}, url = {http://blogs.nature.com/aviewfromthebridge/2015/05/27/lust-and-the-turing-test/}, author = {Christof Koch} } @conference {1754, title = {Metareasoning in Symbolic Domains}, booktitle = {NIPS Workshop | Bounded Optimality and Rational Metareasoning}, year = {2015}, month = {12/2015}, abstract = {

Many AI problems, such as planning, grammar learning, program induction, and theory discovery, require searching in symbolic domains. Most models perform this search by evaluating a sequence of candidate solutions, generated in order by some heuristic. Human reasoning, though, is not limited to sequential trial and error.\  In particular, humans are able to reason about what the solution to a particular problem should look like, before comparing candidates against the data. In a program synthesis task, for instance, a human might first determine that the task at hand should be solved by a tail-recursive algorithm, before filling in the\  algorithm{\textquoteright}s details.

Reasoning in this way about solution structure confers at least two computational advantages. First, a given structure subsumes a potentially large collection of primitive solutions, and exploiting the constraints present in the structure{\textquoteright}s definition makes it possible to evaluate the collection in substantially less time than it would take to evaluate each in turn. For example, a programmer might quickly conclude that a given algorithm cannot be implemented without recursion, without having to consider all possible non-recursive solutions. Second, it is often possible to estimate ahead of time the cost of evaluating different structures, making it possible to prioritize those that can be treated cheaply. In planning a route through an unfamiliar city, for example, one might first consider possibilities which use the subway exclusively, excluding for the moment ones that involve bus trips as well: if a successful subway-only solution can be found, one then avoids the (potentially) exponentially more difficult bus-and-subway search problem.

Here, we consider a family of toy problems [1], in which an agent is given a balance scale, and is required to find a lighter counterfeit coin in a collection of genuine coins using at most some prescribed number of weighings. We develop a language for expressing solution structure that places restrictions on a set of programs, and use recent program synthesis techniques to search for a solution, encoded as a program, subject to hypothesized constraints on the program structure.
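
For reference, the structural insight behind this toy family: splitting the coins into three near-equal groups per weighing isolates a single lighter counterfeit among n coins in about log base 3 of n weighings. The plain solver below is our illustration of that structure, not the program-synthesis approach taken in the paper.

def find_light_coin(weights):
    # weights: exactly one entry is smaller than the (equal) rest; returns (index, weighings used)
    coins = list(range(len(weights)))
    weighings = 0
    while len(coins) > 1:
        k = (len(coins) + 2) // 3                        # size of each of the two weighed groups
        left, right, rest = coins[:k], coins[k:2 * k], coins[2 * k:]
        weighings += 1
        wl = sum(weights[i] for i in left)
        wr = sum(weights[i] for i in right)
        coins = left if wl < wr else right if wr < wl else rest
    return coins[0], weighings

weights = [10] * 27
weights[13] = 9                                          # the lighter counterfeit
print(find_light_coin(weights))                          # (13, 3): 27 coins need only 3 weighings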

}, url = {https://sites.google.com/site/boundedoptimalityworkshop/}, author = {Kevin Ellis and Owen Lewis} } @proceedings {1762, title = {A model for full local image interpretation}, year = {2015}, abstract = {

We describe a computational model of humans{\textquoteright} ability to provide a detailed interpretation of a scene{\textquoteright}s components. Humans can identify in an image meaningful components almost everywhere, and identifying these components is an essential part of the visual process, and of understanding the surrounding scene and its potential meaning to the viewer. Detailed interpretation is beyond the scope of current models of visual recognition. Our model suggests that this is a fundamental limitation, related to the fact that existing models rely on feed - forward but limited top - down processing. In our model, a first recognition stage leads to the initial activation of class candidates, which is incomplete and with limited accuracy. This stage then triggers the application of class - specific interpretation and validation processes, which recover richer and more accurate interpretation of the visible scene. We discuss implications of the model for visual interpretation by humans and by computer vision models.

}, author = {Guy Ben-Yosef and Liav Assif and Daniel Harari and Shimon Ullman} } @conference {1870, title = {Model-based Story Summary}, booktitle = {6th Workshop on Computational Models of Narrative}, year = {2015}, month = {05/2015}, address = {Atlanta, Georgia}, abstract = {
A story summarizer benefits greatly from a reader model because a reader model enables the story summarizer to focus on delivering useful knowledge in minimal time with minimal effort. Such a summarizer can, in particular, eliminate disconnected story elements, deliver only story elements connected to conceptual content, focus on particular concepts of interest, such as revenge, and make use of our human tendency to see causal connection in adjacent sentences. Experiments with a summarizer, built on the Genesis story understanding system, demonstrate considerable compression of an 85-element pr{\'e}cis of the plot of Shakespeare{\textquoteright}s Macbeth, reducing it, for example, to the 14 elements that make it a concise summary about Pyrrhic victory. Refocusing the summarizer on regicide reduces the element count to 7, or 8\% of the original.
}, doi = {10.4230/OASIcs.CMN.2015.157}, url = {http://dx.doi.org/10.4230/OASIcs.CMN.2015.157}, author = {Patrick Henry Winston} } @article {2653, title = {Monkeys head-gaze following is fast, precise and not fully suppressible.}, journal = {Proc Biol Sci}, volume = {282}, year = {2015}, month = {2015 Oct 07}, pages = {20151020}, abstract = {

Human eye-gaze is a powerful stimulus, drawing the observer{\textquoteright}s attention to places and objects of interest to someone else ({\textquoteright}eye-gaze following{\textquoteright}). The largely homogeneous eyes of monkeys, compromising the assessment of eye-gaze by conspecifics from larger distances, explain the absence of comparable eye-gaze following in these animals. Yet, monkeys are able to use peer head orientation to shift attention ({\textquoteright}head-gaze following{\textquoteright}). How similar are monkeys{\textquoteright} head-gaze and human eye-gaze following? To address this question, we trained rhesus monkeys to make saccades to targets, either identified by the head-gaze of demonstrator monkeys or, alternatively, identified by learned associations between the demonstrators{\textquoteright} facial identities and the targets (gaze versus identity following). In a variant of this task that occurred at random, the instruction to follow head-gaze or identity was replaced in the course of a trial by the new rule to detect a change of luminance of one of the saccade targets. Although this change-of-rule rendered the demonstrator portraits irrelevant, they nevertheless influenced performance, reflecting a precise redistribution of spatial attention. The specific features depended on whether the initial rule was head-gaze or identity following: head-gaze caused an insuppressible shift of attention to the target gazed at by the demonstrator, whereas identity matching prompted much later shifts of attention, however, only if the initial rule had been identity following. Furthermore, shifts of attention prompted by head-gaze were spatially precise. Automaticity and swiftness, spatial precision and limited executive control characterizing monkeys{\textquoteright} head-gaze following are key features of human eye-gaze following. This similarity supports the notion that both may rely on the same conserved neural circuitry.

}, keywords = {Animals, Attention, Face, Head, learning, Macaca mulatta, Saccades, Vision, Ocular}, issn = {1471-2954}, doi = {10.1098/rspb.2015.1020}, author = {Marciniak, Karolina and Dicke, Peter W and Thier, Peter} } @article {839, title = {Moving the lab home: validation of a web-based system for developmental studies}, year = {2015}, abstract = {


Many practical considerations affect the kinds of scientific questions typically pursued by developmental labs.\  Sample size is limited by the resources involved in participant recruitment and outreach, which constrains investigations to phenomena expected to manifest in most children and generate large condition differences. Special populations and longitudinal designs are often avoided outside of historically specialized labs because of the resources involved in recruitment and testing.\  These practical constraints limit our ability to establish small or graded effects and to learn about specific disorders, individual differences, and the effects of interventions.\ 

We present a novel online interface\  for infant and child recruitment and testing to enable large-scale participation in developmental studies, analogous to the kind of participation enabled by Amazon Mechanical Turk in adult cognitive science. \ While families complete a short, browser-based developmental study, webcam recording is conducted programmatically using a custom-written, freely available Javascript library which interfaces with Flash.

Initial tests of the online studies demonstrated that preferential looking is easily elicited and detected from a webcam recording (see Figure 1). Study 1 validated preferential looking measures by replicating Experiment 2 of Yuan \& Fisher (2009), which found that 2-year-olds could store information about the syntactic structure of a novel verb{\textemdash}even before learning what the verb meant.\  As in the original paper, two-year-olds who heard a novel transitive verb spent proportionately more of their time looking at two-participant events compared to those who heard an intransitive verb when asked to find the novel verb (interaction between transitivity and question type, p = 0.05; N = 48 2-year-olds tested on either a computer-based, webcam-recorded protocol in the lab or an identical protocol online).

Study 2 validated verbal response measures in preschoolers by replicating a study by Schulz, Bonawitz, \& Griffiths (2007) which demonstrated that by age 4-5 preschoolers take into account both prior beliefs and statistical evidence in attributing causal power; however, statistical evidence was taken into account by younger children only for initially plausible relationships. 64 3-year-olds (aged 36 to 42 months) and 105 4-year-olds (aged 48 to 60 months) have participated in a storybook-based online causal reasoning study.\  Their verbal responses are consistent with a qualitative replication of the age trend reported in Schulz et al. (2007) (see Figure 2).

Study 3 is a replication of work by Pasquini et al. (2007) on preschoolers{\textquoteright} sensitivity to informant accuracy in epistemic trust, further validating verbal response measures using video stimuli and a purely online population.\  Study 4 uses a replication of Teglas et al. (2007) to investigate differences in looking time measures online and in the lab.

We believe that most methods currently used in behavioral developmental research {\textendash} looking-time and preferential looking studies, forced choice questions, structured interviews, and reaching tasks {\textendash} can be transferred to the online environment. By reducing practical constraints on research with young children, we do not simply make life easier for researchers: we can expand the scope of the questions asked and make it more possible to run the scientifically right study to answer a question of interest.

}, author = {Kim Scott}, editor = {Laura Schulz} } @article {460, title = {Neural ensemble communities: open-source approaches to hardware for large-scale electrophysiology}, journal = {Current Opinion in Neurobiology}, volume = {32}, year = {2015}, month = {01/2015}, pages = {53 - 59}, abstract = {

One often-overlooked factor when selecting a platform for large-scale electrophysiology is whether or not a particular data acquisition system is {\textquoteleft}open{\textquoteright} or {\textquoteleft}closed{\textquoteright}: that is, whether or not the system{\textquoteright}s schematics and source code are available to end users. Open systems have a reputation for being difficult to acquire, poorly documented, and hard to maintain. With the arrival of more powerful and compact integrated circuits, rapid prototyping services, and web-based tools for collaborative development, these stereotypes must be reconsidered. We discuss some of the reasons why multichannel extracellular electrophysiology could benefit from open-source approaches and describe examples of successful community-driven tool development within this field. In order to promote the adoption of open-source hardware and to reduce the need for redundant development efforts, we advocate a move toward standardized interfaces that connect each element of the data processing pipeline. This will give researchers the flexibility to modify their tools when necessary, while allowing them to continue to benefit from the high-quality products and expertise provided by commercial vendors.

Available online 17 December 2014

In Print:\  June 2015

}, issn = {09594388}, doi = {10.1016/j.conb.2014.11.004}, url = {http://www.sciencedirect.com/science/article/pii/S0959438814002268}, author = {Siegle, Joshua H and Gregory J Hale and Jonathan P Newman and Voigts, Jakob} } @article {1206, title = {Not So Innocent: Toddlers{\textquoteright} Inferences About Costs and Culpability}, journal = {Psychological Science }, volume = {26}, year = {2015}, month = {05/2015}, pages = {633-40}, abstract = {

Adults{\textquoteright} social evaluations are influenced by their perception of other people{\textquoteright}s competence and motivation: Helping when it is difficult to help is praiseworthy, and not helping when it is easy to help is reprehensible. Here, we look at whether children{\textquoteright}s social evaluations are affected by the costs that agents incur. We found that toddlers can use the time and effort associated with goal-directed actions to distinguish agents, and that children prefer agents who incur fewer costs in completing a goal. When two agents refuse to help, children retain a preference for the more competent agent but infer that the less competent agent is nicer. These results suggest that children value agents who incur fewer costs, but understand that failure to engage in a low-cost action implies a lack of motivation. We propose that a naive utility calculus underlies inferences from the costs and rewards of goal-directed action and thereby supports social cognition.

}, keywords = {cognitive development, open data, open materials, social cognition, theory of mind}, doi = {10.1177/0956797615572806}, url = {http://pss.sagepub.com/content/early/2015/04/09/0956797615572806}, author = {Julian Jara-Ettinger}, editor = {Joshua B. Tenenbaum and Laura Schulz} } @article {1439, title = {Notes on Hierarchical Splines, DCLNs and i-theory}, year = {2015}, abstract = {

We define an extension of classical additive splines for multivariate function approximation that we call hierarchical splines. We show that the case of hierarchical, additive, piece-wise linear splines includes present-day Deep Convolutional Learning Networks (DCLNs) with linear rectifiers and pooling (sum or max). We discuss how these observations together with i-theory may provide a framework for a general theory of deep networks.
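
Schematically, and in our notation rather than the report's: a classical additive spline and a two-level hierarchical additive spline differ as

\[ f_{\mathrm{additive}}(x) = \sum_{i=1}^{d} g_i(x_i), \qquad f_{\mathrm{hierarchical}}(x) = \sum_{k} h_k\Big(\sum_{i=1}^{d} g_{k,i}(x_i)\Big), \]

with every one-dimensional piece piecewise linear. A rectifier unit $\max(0, w \cdot x + b)$ is exactly a piecewise-linear function applied to an additive combination of the inputs, which is the sense in which DCLN layers fall inside this class.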

}, author = {Tomaso Poggio and Lorenzo Rosasco and Amnon Shashua and Nadav Cohen and F. Anselmi} } @conference {1896, title = {One Shot Learning by Composition of Meaningful Patches}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2015}, month = {12/2015}, address = {Santiago, Chile}, abstract = {
The task of discriminating one object from another is almost trivial for a human being. However, this task is computationally taxing for most modern machine learning methods, whereas we perform it with ease given very few examples for learning. It has been proposed that the quick grasp of a concept may come from the shared knowledge between the new example and examples previously learned. We believe that the key to one-shot learning is the sharing of common parts, as each part holds immense amounts of information on how a visual concept is constructed. We propose an unsupervised method for learning a compact dictionary of image patches representing meaningful components of an object. Using those patches as features, we build a compositional model that outperforms a number of popular algorithms on a one-shot learning task. We demonstrate the effectiveness of this approach on hand-written digits and show that this model generalizes to multiple datasets.
}, author = {Alex Wong and Alan Yuille} } @conference {2580, title = {One Shot Learning via Compositions of Meaningful Patches}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2015}, abstract = {

The task of discriminating one object from another is almost trivial for a human being. However, this task is computationally taxing for most modern machine learning methods, whereas we perform it with ease given very few examples for learning. It has been proposed that the quick grasp of a concept may come from the shared knowledge between the new example and examples previously learned. We believe that the key to one-shot learning is the sharing of common parts, as each part holds immense amounts of information on how a visual concept is constructed. We propose an unsupervised method for learning a compact dictionary of image patches representing meaningful components of an object. Using those patches as features, we build a compositional model that outperforms a number of popular algorithms on a one-shot learning task. We demonstrate the effectiveness of this approach on hand-written digits and show that this model generalizes to multiple datasets.

}, author = {Alex Wong and Alan Yuille} } @article {1831, title = {Optogenetic feedback control of neural activity.}, journal = {Elife}, volume = {4}, year = {2015}, month = {2015}, pages = {e07192}, abstract = {

Optogenetic techniques enable precise excitation and inhibition of firing in specified neuronal populations and artifact-free recording of firing activity. Several studies have suggested that optical stimulation provides the precision and dynamic range requisite for closed-loop neuronal control, but no approach yet permits feedback control of neuronal firing. Here we present the {\textquoteright}optoclamp{\textquoteright}, a feedback control technology that provides continuous, real-time adjustments of bidirectional optical stimulation in order to lock spiking activity at specified targets over timescales ranging from seconds to days. We demonstrate how this system can be used to decouple neuronal firing levels from ongoing changes in network excitability due to multi-hour periods of glutamatergic or GABAergic neurotransmission blockade in vitro as well as impinging vibrissal sensory drive in vivo. This technology enables continuous, precise optical control of firing in neuronal populations in order to disentangle causally related variables of circuit activation in a physiologically and ethologically relevant manner.

}, keywords = {Action Potentials, Cytological Techniques, Feedback, Humans, Neurons, Optogenetics}, issn = {2050-084X}, doi = {10.7554/eLife.07192}, author = {Jonathan P Newman and Fong, Ming-fai and Millard, Daniel C and Whitmire, Clarissa J and Stanley, Garrett B and Potter, Steve M} } @article {2060, title = {Our Mother the Machine, by Dan Rockmore [Huffpost] }, year = {2015}, month = {05/2015}, publisher = {Huffington Post}, abstract = {

Prof. Dan Rockmore, Director of the Neukom Institute for Computational Science at Dartmouth College and member of the CBMM External Advisory Committee (EAC), has contributed an article to the Huffington Post which may be of interest to the CBMM Community.

"The last couple years were pretty great years in the movies for artificial intelligence. "Her" -- the first movie ever about a relationship between an operating system and a person -- as well as the "The Imitation Game" -- about the father of AI, Alan Turing -- each won Oscars for their screenplays in addition to earning multiple nominations, including for best picture. Central to both of these movies was the famous "Turing Test," a thought experiment for providing a benchmark for achieving a level of human intelligence in a machine, wherein a human is given the opportunity to "converse" with a hidden partner, either another person or a machine. If the conversational guinea pig mistakes a machine for a person, then technology has finally made the great leap forward in the bridging of brains with bits and bytes. ... "

}, url = {http://www.huffingtonpost.com/dan-rockmore/our-mother-the-machine_b_7273504.html}, author = {Dan Rockmore} } @article {1355, title = {Parsing Occluded People by Flexible Compositions}, number = {034}, year = {2015}, month = {06/1/2015}, abstract = {

This paper presents an approach to parsing humans when there is significant occlusion. We model humans using a graphical model with a tree structure, building on recent work [32, 6], and exploit the connectivity prior that, even in the presence of occlusion, the visible nodes form a connected subtree of the graphical model. We call each connected subtree a flexible composition of object parts. This involves a novel method for learning occlusion cues. During inference we need to search over a mixture of different flexible models. By exploiting part sharing, we show that this inference can be done extremely efficiently, requiring only twice as many computations as searching for the entire object (i.e., not modeling occlusion). We evaluate our model on the standard benchmarked {\textquotedblleft}We Are Family{\textquotedblright} Stickmen dataset and obtain significant performance improvements over the best alternative algorithms.

}, author = {Xianjie Chen and Alan Yuille} } @article {1750, title = {Parts-based representations of perceived face movements in the superior temporal sulcus}, year = {2015}, month = {11/19/2015}, address = {Chicago, IL}, url = {https://www.sfn.org/~/media/SfN/Documents/Annual\%20Meeting/FinalProgram/NS2015/Daily\%20Books\%202015/AM15Monday.ashx}, author = {Ben Deen and Rebecca Saxe} } @conference {1803, title = {Perceiving Fully Occluded Objects with Physical Simulation}, booktitle = {Cognitive Science Conference (CogSci)}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, author = {Ilker Yildirim and Max Siegel and Joshua B. Tenenbaum} } @conference {1084, title = {Picture: An Imperative Probabilistic Programming Language for Scene Perception}, booktitle = {Computer Vision and Pattern Recognition}, year = {2015}, author = {Tejas Kulkarni and Pushmeet Kohli and Joshua B. Tenenbaum and Vikash Mansinghka} } @article {1800, title = {Population Coding, Correlations, and Functional Connectivity in the mouse visual system with the Cortical Activity Map (CAM)}, year = {2015}, month = {09/19/2015}, type = {Poster}, abstract = {

The Cortical Activity Map will provide neural responses from large sets of simultaneously recorded cells to a diverse set of visual stimuli from awake, behaving mice in multiple layers, regions, and cell types. This data set allows for unprecedented access to population responses and provides a unique opportunity to explore the collective characteristics of neural dynamics. The visual stimuli for CAM include gratings, sparse noise, spatio-temporal noise, simple objects, natural images, and natural movies. We demonstrate the power of this data set by exploring the nature of population coding in the visual system. To assess information processing across visual areas we develop decoders, and we analyze and compare the performance of these decoders for each stimulus type. In particular, we compare the performance of correlation-based decoders and decoders with functional connectivity to independent decoders. For more complex stimuli, we use these models as the basis for reconstruction of the visual stimulus. This set of analyses demonstrates that the Cortical Activity Map will be a powerful tool for exploring the joint activity of large populations of neurons.
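
A hedged, minimal example of the kind of population decoder meant here: a linear readout predicting stimulus identity from a vector of simultaneously recorded responses. The data below are synthetic stand-ins, not CAM recordings, and the decoder choice is ours.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n_trials, n_neurons, n_stimuli = 600, 80, 6
stim = rng.integers(0, n_stimuli, n_trials)
tuning = rng.standard_normal((n_stimuli, n_neurons))            # each stimulus evokes a mean pattern
responses = tuning[stim] + rng.standard_normal((n_trials, n_neurons))

decoder = LogisticRegression(max_iter=2000)
print(cross_val_score(decoder, responses, stim, cv=5).mean())   # cross-validated decoding accuracy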

}, author = {Michael Buice and Saskia de Vries}, editor = {Amy Bernard and Brandon Rogeres and Casey White and Chinh Dang and Pete Groblewski and Chris Lau and Cliff Slaughterbeck and Colin Farrell and Derric Williams and Jack Waters and Jed Perkins and Kate Roll and Leonard Kuan and Lydia Ng and Marina Garrett and Natalia Orlova and Shawn Olsen and Sissy Cross and Stefan Mihalas and Thomas Keenan and Wayne Wakeman and John Phillips and Christof Koch and Clay Reid} } @article {1487, title = {Predicting actions before they occur}, year = {2015}, month = {10/27/2015}, abstract = {
Humans are experts at reading others{\textquoteright} actions in social contexts. They efficiently process others{\textquoteright} movements in real-time to predict intended goals. Here we designed a two-person reaching task to investigate real-time body reading in a naturalistic setting. Two subjects faced each other separated by a plexiglass screen. One (Attacker) was instructed to tap one of two targets on the screen and the other (Blocker) was told to tap the same target as quickly as possible. Reaction times were fast, much faster than reaction times to a dot projected on the screen moving in the same manner. This suggests Blockers use subtle preparatory movements of Attackers to predict their goal. Next, using video recordings of an Attacker, we showed that removing the preparatory cues slows reaction times and changing them could trick the Blockers into choosing the wrong target. We then occluded various body parts of the Attacker and showed that reaction times slow down only when most of the body of the Attacker is occluded. This suggests that preparatory cues are distributed over the body of the Attacker. We saw no evidence of learning during the experiment as reaction times remained constant over the duration of the session. Taken together, these results suggest that in social contexts humans are able to use their knowledge of the biomechanical constraints on the human body to efficiently process preparatory cues from the body of their interaction partner in order to predict their intentions well before movement begins.
}, keywords = {Action anticipation, Action reading, Biological motion, Social interaction}, author = {Maryam Vaziri-Pashkam}, editor = {Sarah Cormiea} } @article {2693, title = {Preverbal Infants{\textquoteright} Third-Party Imitator Preferences: Animated Displays versus Filmed Actors}, year = {2015}, month = {08/2015}, address = {MIT, Cambridge, MA}, author = {Heather L Kosakowski and Lindsey J Powell and Elizabeth S Spelke} } @proceedings {1203, title = {Quit while you{\textquoteright}re ahead: Preschoolers{\textquoteright} persistence and willingness to accept challenges are affected by social comparison.}, year = {2015}, month = {07/2015}, address = {Pasadena, CA}, author = {Rachel Magid and Laura Schulz} } @conference {869, title = {Reorientation ability predicts early spatial symbol reading}, booktitle = {2015 Society for Research in Child Development Biennial Meeting}, year = {2015}, author = {Moira R Dillon and Elizabeth S Spelke} } @proceedings {924, title = {Responsibility judgments in voting scenarios}, year = {2015}, month = {07/22/2015}, pages = {788-793}, address = {Pasadena, CA}, issn = {978-0-9911967-2-2}, url = {https://mindmodeling.org/cogsci2015/papers/0143/index.html}, author = {Tobias Gerstenberg and Joseph Y Halpern and Joshua B. Tenenbaum} } @article {694, title = {A Review of Relational Machine Learning for Knowledge Graphs: From Multi-Relational Link Prediction to Automated Knowledge Graph Construction}, number = {028}, year = {2015}, month = {03/2015}, abstract = {

Relational machine learning studies methods for the statistical analysis of relational, or graph-structured, data. In this paper, we provide a review of how such statistical models can be {\textquotedblleft}trained{\textquotedblright} on large knowledge graphs, and then used to predict new facts about the world (which is equivalent to predicting new edges in the graph). In particular, we discuss two different kinds of statistical relational models, both of which can scale to massive datasets. The first is based on tensor factorization methods and related latent variable models. The second is based on mining observable patterns in the graph. We also show how to combine these latent and observable models to get improved modeling power at decreased computational cost. Finally, we discuss how such statistical models of graphs can be combined with text-based information extraction methods for automatically constructing knowledge graphs from the Web. In particular, we discuss Google{\textquoteright}s Knowledge Vault project.
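
A hedged sketch of the latent-variable flavor of model surveyed: a RESCAL-style bilinear scorer in which each entity gets an embedding and each relation a matrix, trained here with a logistic loss against corrupted triples. Dimensions, data and the training loop are our toy choices, not the review's recommended recipe.

import numpy as np

rng = np.random.default_rng(0)
n_entities, n_relations, dim = 20, 3, 8
E = 0.1 * rng.standard_normal((n_entities, dim))        # entity embeddings
W = 0.1 * rng.standard_normal((n_relations, dim, dim))  # one matrix per relation

def score(s, r, o):
    return E[s] @ W[r] @ E[o]                            # plausibility of the triple (s, r, o)

triples = [(int(rng.integers(n_entities)), int(rng.integers(n_relations)), int(rng.integers(n_entities)))
           for _ in range(200)]                          # observed "facts"

lr = 0.05
for _ in range(200):
    for s, r, o in triples:
        o_neg = int(rng.integers(n_entities))            # corrupted object as a negative example
        for obj, label in ((o, 1.0), (o_neg, 0.0)):
            p = 1.0 / (1.0 + np.exp(-score(s, r, obj)))
            g = p - label                                 # gradient of the logistic loss w.r.t. the score
            e_s, e_o, w_r = E[s].copy(), E[obj].copy(), W[r].copy()
            E[s]   -= lr * g * (w_r @ e_o)
            E[obj] -= lr * g * (w_r.T @ e_s)
            W[r]   -= lr * g * np.outer(e_s, e_o)

print(score(*triples[0]))                                # observed triples now receive high scores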

}, author = {Maximilian Nickel and Kevin Murphy and Volker Tresp and Evgeniy Gabrilovich} } @proceedings {1883, title = {Scene-Domain Active Part Models for Object Representation}, year = {2015}, month = {12/2015}, pages = { 2497 - 2505 }, address = {Santiago, Chile}, abstract = {

In this paper, we are interested in enhancing the expressivity and robustness of part-based models for object representation, in the common scenario where the training data are based on 2D images. To this end, we propose scene-domain active part models (SDAPM), which reconstruct and characterize the 3D geometric statistics between an object{\textquoteright}s parts in the 3D scene-domain by using 2D training data in the image-domain alone. And on top of this, we explicitly model and handle occlusions in SDAPM. Together with the developed learning and inference algorithms, such a model provides rich object descriptions, including 2D object and parts localization, 3D landmark shape and camera viewpoint, which offers an effective representation for various image understanding tasks, such as object and parts detection, 3D landmark shape and viewpoint estimation from images. Experiments on the above tasks show that SDAPM outperforms previous part-based models, and thus demonstrates the potential of the proposed technique.

}, doi = {10.1109/ICCV.2015.287}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=7410644\&tag=1}, author = {Zhou Ren and Chaohui Wang and Alan Yuille} } @article {2063, title = {A Science of Intelligence }, year = {2015}, month = {07/2015}, abstract = {

We are in the midst of a revolution in machine intelligence, the engineering of getting computers to perform tasks that, until recently, could only be done by people. You can speak to your smart phone and it answers back, software identifies faces at border-crossings and labels people and objects in pictures posted to social media. Algorithms can teach themselves to play Atari video games. A camera and chip embedded into the front view-mirror of top-of-the-line sedans let the vehicle drive autonomously on the open road...

}, author = {Christof Koch and Tomaso Poggio} } @conference {2581, title = {Semantic Part Segmentation using Compositional Model combining Shape and Appearance}, booktitle = {CVPR}, year = {2015}, abstract = {

In this paper, we study the problem of semantic part segmentation for animals. This is more challenging than standard object detection, object segmentation and pose estimation tasks because semantic parts of animals often have similar appearance and highly varying shapes. To tackle these challenges, we build a mixture of compositional models to represent the object boundary and the boundaries of semantic parts. And we incorporate edge, appearance, and semantic part cues into the compositional model. Given part-level segmentation annotation, we develop a novel algorithm to learn a mixture of compositional models under various poses and viewpoints for certain animal classes. Furthermore, a linear complexity algorithm is offered for efficient inference of the compositional model using dynamic programming. We evaluate our method for horse and cow using a newly annotated dataset on Pascal VOC 2010 which has pixelwise part labels. Experimental results demonstrate the effectiveness of our method.

}, author = {Jianyu Wang and Alan Yuille} } @article {1154, title = {Sensitivity to timing and order in human visual cortex}, journal = {Journal of Neurophysiology}, volume = {113}, year = {2015}, month = {Jan-03-2015}, pages = {1656 - 1669}, issn = {0022-3077}, doi = {10.1152/jn.00556.2014}, url = {http://jn.physiology.org/lookup/doi/10.1152/jn.00556.2014}, author = {Jedediah Singer and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {1809, title = {Six-month-old infants represent action efficiency on a continuous scale.}, year = {2015}, month = {10/2015}, address = {Columbus, Ohio}, author = {Shari Liu and Elizabeth S Spelke} } @article {1164, title = {Towards a Programmer{\textquoteright}s Apprentice (Again)}, number = {030}, year = {2015}, month = {04/2015}, abstract = {

Programmers are loath to interrupt their workflow to document their design rationale, leading to frequent errors when software is modified{\textemdash}often much later and by different programmers. A Programmer{\textquoteright}s Assistant could interact with the programmer to capture and preserve design rationale, in a natural way that would make rationale capture {\textquotedblleft}cost less than it{\textquoteright}s worth{\textquotedblright}, and could also detect common flaws in program design. Such a programmer{\textquoteright}s assistant was not practical when it was first proposed decades ago, but advances over the years make now the time to revisit the concept, as our prototype shows.

}, author = {Howard Shrobe and Boris Katz and Randall Davis} } @article {1059, title = {Unconscious perception of an opponent{\textquoteright}s goal}, year = {2015}, month = {09/2015}, abstract = {

Humans are experts at reading others{\textquoteright} actions. They effortlessly navigate a crowded street or reach for a handshake without grabbing an elbow. This suggests real-time, efficient processing of others{\textquoteright} movements and the ability to predict intended future movements. We designed a competitive reaching task where two subjects faced each other separated by a plexiglass screen. Fingertip positions were recorded with magnetic sensors. One subject (Attacker) was instructed via headphones to tap one of two targets on the screen and the other subject (Blocker) was told to try to reach the same target as quickly as possible. Reaction times, measured as the difference in initial finger movement (finger launch) of Attacker and Blocker were fast (~150ms): much faster than reaction times to a moving dot projected on the screen (~250ms). This suggests Blockers use preparatory actions of Attackers to predict their goal before finger launch. Next, we videotaped an Attacker and projected the video onto the transparent screen. Blockers{\textquoteright} reaction times to the videos matched those to a real Attacker. In half the blocks we cut the preparatory information from the video. Blockers were ~120ms slower responding to cut videos, suggesting preparatory information is predictive ~120ms before finger launch. Finally we played the videos from the start to various times relative to finger launch and asked subjects to report the Attacker{\textquoteright}s goal with button presses. Surprisingly when videos were cut at ~120ms before finger launch subjects{\textquoteright} accuracy was ~70\%: significantly lower than the accuracy of arm movements in response to full videos (~98\%). This suggests that in the arm movement task subjects utilize implicit information that is not consciously accessible in the button press task. Taken together, these results suggest participants in a competitive interaction have implicit or unconscious knowledge of the intentions of their partner before movement begins.

}, doi = {10.1167/15.12.43}, url = {http://jov.arvojournals.org/article.aspx?articleid=2433081}, author = {Sarah Cormiea and Maryam Vaziri-Pashkam and Ken Nakayama} } @article {1415, title = {Unsupervised learning of invariant representations}, journal = {Theoretical Computer Science}, year = {2015}, month = {06/25/2015}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity. We consider the case of visual object recognition, though the theory also applies to other domains like speech. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translation, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and selective signature can be computed for each image or image patch: the invariance can be exact in the case of group transformations and approximate under non-group transformations. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such signature. The theory offers novel unsupervised learning algorithms for {\textquotedblleft}deep{\textquotedblright} architectures for image and speech recognition. We conjecture that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and selective for recognition{\textemdash}and show how this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {convolutional networks, Cortex, Hierarchy, Invariance}, doi = {10.1016/j.tcs.2015.06.048}, url = {http://www.sciencedirect.com/science/article/pii/S0304397515005587}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @article {1570, title = {UNSUPERVISED LEARNING OF VISUAL STRUCTURE USING PREDICTIVE GENERATIVE NETWORKS}, year = {2015}, month = {12/15/2015}, abstract = {

The ability to predict future states of the environment is a central pillar of intelligence. At its core, effective prediction requires an internal model of the world and an understanding of the rules by which the world changes. Here, we explore the internal models developed by deep neural networks trained using a loss based on predicting future frames in synthetic video sequences, using an Encoder-Recurrent-Decoder framework (Fragkiadaki et al., 2015). We first show that this architecture can achieve excellent performance in visual sequence prediction tasks, including state-of-the-art performance in a standard {\textquotedblleft}bouncing balls{\textquotedblright} dataset (Sutskever et al., 2009). We then train on clips of out-of-the-plane rotations of computer-generated faces, using both mean-squared error and a generative adversarial loss (Goodfellow et al., 2014), extending the latter to a recurrent, conditional setting. Despite being trained end-to-end to predict only pixel-level information, our Predictive Generative Networks learn a representation of the latent variables of the underlying generative process. Importantly, we find that this representation is naturally tolerant to object transformations, and generalizes well to new tasks, such as classification of static images. Similar models trained solely with a reconstruction loss fail to generalize as effectively. We argue that prediction can serve as a powerful unsupervised loss for learning rich internal representations of high-level object features.

}, author = {William Lotter and Gabriel Kreiman and David Cox} } @article {1260, title = {Using fNIRS to Map Functional Specificity in the Infant Brain: An fROI Approach}, year = {2015}, author = {Lindsey J Powell and Ben Deen and Li Guo and Rebecca Saxe} } @article {1761, title = {Visual categorization of social interactions}, journal = {Visual Cognition }, volume = {22}, year = {2015}, month = {02/06/2015}, abstract = {

Prominent theories of action recognition suggest that during the recognition of actions the physical pattern of an action is associated with only one action interpretation (e.g., a person waving his arm is recognized as waving). In contrast to this view, studies examining the visual categorization of objects show that objects are recognized in multiple ways (e.g., a VW Beetle can be recognized as a car or a beetle) and that categorization performance is based on the visual and motor movement similarity between objects. Here, we studied whether we find evidence for multiple levels of categorization for social interactions (physical interactions with another person, e.g., handshakes). To do so, we compared visual categorization of objects and social interactions (Experiments 1 and 2) in a grouping task and assessed the usefulness of motor and visual cues (Experiments 3, 4, and 5) for object and social interaction categorization. Additionally, we measured recognition performance associated with recognizing objects and social interactions at different categorization levels (Experiment 6). We found that basic level object categories were associated with a clear recognition advantage compared to subordinate recognition but basic level social interaction categories provided only a little recognition advantage. Moreover, basic level object categories were more strongly associated with similar visual and motor cues than basic level social interaction categories. The results suggest that cognitive categories underlying the recognition of objects and social interactions are associated with different performances. These results are in line with the idea that the same action can be associated with several action interpretations (e.g., a person waving his arm can be recognized as waving or greeting).

}, doi = {10.1080/13506285.2014.991368}, author = {Stephan de la Rosa and Rabia N. Choudhery and Crist{\'o}bal Curio and Shimon Ullman and Liav Assif and Heinrich H. B{\"u}lthoff} } @article {2062, title = {What if...}, year = {2015}, month = {06/2015}, abstract = {

The background: DCLNs (Deep Convolutional Learning Networks) are doing very well

Over the last 3 years and increasingly so in the last few months, I have seen supervised DCLNs {\textemdash} feedforward and recurrent {\textemdash} do more and more of everything quite well. They seem to learn good representations for a growing number of speech and text problems (for a review by the pioneers in the field see LeCun, Bengio, Hinton, 2015). More interestingly, it is increasingly clear, as I will discuss later, that instead of being trained on millions of labeled examples they can be trained in implicitly supervised ways. This breakthrough in machine learning triggers a few dreams. What if we have now the basic answer to how to develop brain-like intelligence and its basic building blocks?...

}, author = {Tomaso Poggio} } @article {1466, title = {Whole-agent selectivity within the macaque face-processing system}, journal = {Proceedings of the National Academy of Sciences (PNAS)}, volume = {112}, year = {2015}, month = {10/2015}, chapter = {14717}, abstract = {

The primate brain contains a set of face-selective areas, which are thought to extract the rich social information that faces provide, such as emotional state and personal identity. The nature of this information raises a fundamental question about these face-selective areas: Do they respond to a face purely because of its visual attributes, or because the face embodies a larger social agent? Here, we used functional magnetic resonance imaging to determine whether the macaque face patch system exhibits a whole-agent response above and beyond its responses to individually presented faces and bodies. We found a systematic development of whole-agent preference through the face patches, from subadditive integration of face and body responses in posterior face patches to superadditive integration in anterior face patches. Superadditivity was not observed for faces atop nonbody objects, implying categorical specificity of face{\textendash}body interaction. Furthermore, superadditivity was robust to visual degradation of facial detail, suggesting whole-agent selectivity does not require prior face recognition. In contrast, even the body patches immediately adjacent to anterior face areas did not exhibit superadditivity. This asymmetry between face- and body-processing systems may explain why observers attribute bodies{\textquoteright} social signals to faces, and not vice versa. The development of whole-agent selectivity from posterior to anterior face patches, in concert with the recently described development of natural motion selectivity from ventral to dorsal face patches, identifies a single face patch, AF (anterior fundus), as a likely link between the analysis of facial shape and semantic inferences about other agents.

}, issn = {0027-8424}, doi = {10.1073/pnas.1512378112 }, url = {http://www.pnas.org/content/112/47/14717.abstract}, author = {Clark Fisher and W. A. Freiwald} } @inbook {4069, title = {Working Memory Representations of Visual Motion along the Primate Dorsal Visual Pathway}, booktitle = {Mechanisms of Sensory Working Memory: Attention and Performance XXV.}, year = {2015}, publisher = {Elsevier Inc. }, organization = {Elsevier Inc. }, abstract = {

Mechanisms of Sensory Working Memory: Attention and Performance XXV provides an update on the research surrounding the memory processes that are crucial for many facets of cognitive processing and experience, with new coverage of emerging areas of study, including a new understanding of working memory for features of stimuli devoid of verbal, phonological, or long-term memory content, such as memory for simple visual features (e.g., texture or color), simple auditory features (e.g., pitch), or simple tactile features (e.g., vibration frequency), now called sensory memory to distinguish from verbal memory.

This contemporary focus on sensory memory is just beginning, and this collection of original contributions provides a foundational reference for the study of the mechanisms of sensory memory.

}, issn = {978-0-12-801371-7}, url = {https://www.sciencedirect.com/book/9780128013717/mechanisms-of-sensory-working-memory}, author = {Diego Mendoza-Halliday and Torres, S. and Julio Martinez-Trujillo} } @conference {868, title = {Young children{\textquoteright}s automatic and alternating use of scene and object information in spatial symbols.}, booktitle = {Budapest CEU Conference on Cognitive Development}, year = {2015}, abstract = {

Although symbolic understanding has long been studied, little is known about the 2D shape information children use to relate symbols to their 3D referents. Our previous research suggests that young children rely on length and angle to find locations on objects, but on distance and direction to find locations in scenes. These studies, however, either presented drawings from non-canonical perspectives or probed children{\textquoteright}s use of symbols in unusual environments. Moreover, these studies explored the factors that limit children{\textquoteright}s understanding of spatial symbols, not the sources of their flexibility in this domain.

For the present study, we showed 144 4-year-old children three types of drawings of a typical room, depicting the room{\textquoteright}s objects, its extended surfaces, or both. In one task, children used the drawings to find targets located either at the junction of two extended surfaces in a room or next to objects in the room. In another task, children judged whether drawings that include just scene or just object information are better depictions of targets at these two types of locations.

We found that the limitations previously observed in children{\textquoteright}s use of spatial symbols extend to highly realistic perspectival drawings: children perform better with scene drawings when targets are located at the junctions of extended surfaces in the room and better with object drawings when targets are located near objects, but gain no additional benefit when presented with both types of information. In addition, children show no awareness of this pattern in their performance: they judge drawings of objects to be more informative of all target locations. Common drawings evidently present geometric information in a format automatically accessible to cognitive systems for navigation and object recognition. Young children nevertheless fail to integrate the information that these systems represent, even when shown drawings of the most familiar and natural kinds.

}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {461, title = {Abstracts of the 2014 Brains, Minds, and Machines Summer Course}, number = {024}, year = {2014}, month = {09/2014}, abstract = {

A compilation of abstracts from the student projects of the 2014 Brains, Minds, and Machines Summer School, held at Woods Hole Marine Biological Lab, May 29 - June 12, 2014.

}, author = {Nadav Amir and Tarek R. Besold and Raffaello Camoriano and Goker Erdogan and Thomas Flynn and Grant Gillary and Jesse Gomez and Ariel Herbert-Voss and Gladia Hotan and Jonathan Kadmon and Scott W. Linderman and Tina T. Liu and Andrew Marantan and Joseph Olson and Garrick Orchard and Dipan K. Pal and Giulia Pasquale and Honi Sanders and Carina Silberer and Kevin A Smith and Carlos Stein N. de Briton and Jordan W. Suchow and M. H. Tessler and Guillaume Viejo and Drew Walker and Leila Wehbe and Andrei Barbu and Leyla Isik and Emily Mackevicius and Yasmine Meroz} } @article {357, title = {Can a biologically-plausible hierarchy effectively replace face detection, alignment, and recognition pipelines?}, number = {003}, year = {2014}, month = {03/2014}, abstract = {

The standard approach to unconstrained face recognition in natural photographs is via a detection, alignment, recognition pipeline. While that approach has achieved impressive results, there are several reasons to be dissatisfied with it, among them its lack of biological plausibility. A recent theory of invariant recognition by feedforward hierarchical networks, like HMAX, other convolutional networks, or possibly the ventral stream, implies an alternative approach to unconstrained face recognition. This approach accomplishes detection and alignment implicitly by storing transformations of training images (called templates) rather than explicitly detecting and aligning faces at test time. Here we propose a particular locality-sensitive hashing based voting scheme which we call {\textquotedblleft}consensus of collisions{\textquotedblright} and show that it can be used to approximate the full 3-layer hierarchy implied by the theory. The resulting end-to-end system for unconstrained face recognition operates on photographs of faces taken under natural conditions, e.g., Labeled Faces in the Wild (LFW), without aligning or cropping them, as is normally done. It achieves a drastic improvement in the state of the art on this end-to-end task, reaching the same level of performance as the best systems operating on aligned, closely cropped images (no outside training data). It also performs well on two newer datasets, similar to LFW, but more difficult: LFW-jittered (new here) and SUFR-W.
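As an illustration of the voting idea only (not the authors' system), here is a minimal Python sketch of locality-sensitive-hashing-based identity voting: stored "templates" are hashed with random hyperplanes, and a query votes for the identity whose templates it collides with most often. All dimensions, noise levels, and the use of random vectors in place of features of transformed face images are invented for the example.

```python
import numpy as np

rng = np.random.default_rng(0)

def lsh_codes(X, planes):
    # Random-hyperplane (sign of projection) hash codes for the rows of X.
    return X @ planes.T > 0

# Toy gallery: n_ids identities, each stored as several "transformed template"
# feature vectors (random stand-ins; in a real system these would be features
# of stored, transformed face images).
n_ids, n_templates, dim, n_bits, band = 5, 10, 128, 32, 8
gallery = rng.normal(size=(n_ids * n_templates, dim))
labels = np.repeat(np.arange(n_ids), n_templates)
planes = rng.normal(size=(n_bits, dim))
gallery_codes = lsh_codes(gallery, planes)

def consensus_vote(query_vec):
    # Each hash band in which the query collides with a stored template casts
    # one vote for that template's identity; the most-voted identity wins.
    q = lsh_codes(query_vec[None, :], planes)[0]
    votes = np.zeros(n_ids)
    for start in range(0, n_bits, band):
        hits = np.all(gallery_codes[:, start:start + band] == q[start:start + band], axis=1)
        np.add.at(votes, labels[hits], 1)
    return int(np.argmax(votes))

query = gallery[2 * n_templates] + 0.05 * rng.normal(size=dim)  # noisy view of identity 2
print("predicted identity:", consensus_vote(query))
```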

}, keywords = {Computer vision, Face recognition, Hierarchy, Invariance}, author = {Qianli Liao and JZ. Leibo and Youssef Mroueh and Tomaso Poggio} } @article {452, title = {The Compositional Nature of Event Representations in the Human Brain}, number = {011}, year = {2014}, month = {07/14/2014}, abstract = {

How does the human brain represent simple compositions of constituents: actors, verbs, objects, directions, and locations? Subjects viewed videos during neuroimaging (fMRI) sessions from which sentential descriptions of those videos were identified by decoding the brain representations based only on their fMRI activation patterns. Constituents (e.g., fold and shirt) were independently decoded from a single presentation. Independent constituent classification was then compared to joint classification of aggregate concepts (e.g., fold-shirt); results were similar as measured by accuracy and correlation. The brain regions used for independent constituent classification are largely disjoint and largely cover those used for joint classification. This allows recovery of sentential descriptions of stimulus videos by composing the results of the independent constituent classifiers. Furthermore, classifiers trained on the words one set of subjects think of when watching a video can recognize sentences a different subject thinks of when watching a different video.

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {443, title = {Computational role of eccentricity dependent cortical magnification.}, number = {017}, year = {2014}, month = {06/2014}, abstract = {

We develop a sampling extension of M-theory focused on invariance to scale and translation. Quite surprisingly, the theory predicts an architecture of early vision with increasing receptive field sizes and a high resolution fovea {\textemdash} in agreement with data about the cortical magnification factor, V1 and the retina. From the slope of the inverse of the magnification factor, M-theory predicts a cortical {\textquotedblleft}fovea{\textquotedblright} in V1 in the order of 40 by 40 basic units at each receptive field size {\textemdash} corresponding to a foveola of size around 26 minutes of arc at the highest resolution, ≈6 degrees at the lowest resolution. It also predicts uniform scale invariance over a fixed range of scales independently of eccentricity, while translation invariance should depend linearly on spatial frequency. Bouma{\textquoteright}s law of crowding follows in the theory as an effect of cortical area-by-cortical area pooling; the Bouma constant is the value expected if the signature responsible for recognition in the crowding experiments originates in V2. From a broader perspective, the emerging picture suggests that visual recognition under natural conditions takes place by composing information from a set of fixations, with each fixation providing recognition from a space-scale image fragment {\textemdash} that is an image patch represented at a set of increasing sizes and decreasing resolutions.

}, keywords = {Invariance, Theories for Intelligence}, author = {Tomaso Poggio and Jim Mutch and Leyla Isik} } @article {449, title = {Concepts in a Probabilistic Language of Thought.}, number = {010}, year = {2014}, month = {06/2014}, abstract = {

Knowledge organizes our understanding of the world, determining what we expect given what we have already seen. Our predictive representations have two key properties: they are productive, and they are graded. Productive generalization is possible because our knowledge decomposes into concepts{\textemdash}elements of knowledge that are combined and recombined to describe particular situations. Gradedness is the observable effect of accounting for uncertainty{\textemdash}our knowledge encodes degrees of belief that lead to graded probabilistic predictions. To put this a different way, concepts form a combinatorial system that enables description of many different situations; each such situation specifies a distribution over what we expect to see in the world, given what we have seen. We may think of this system as a probabilistic language of thought (PLoT) in which representations are built from language-like composition of concepts and the content of those representations is a probability distribution on world states. The purpose of this chapter is to formalize these ideas in computational terms, to illustrate key properties of the PLoT approach with a concrete example, and to draw connections with other views of conceptual structure.

Note: The book chapter is reprinted courtesy of The MIT Press, from the forthcoming edited collection {\textquotedblleft}The Conceptual Mind: New Directions in the Study of Concepts{\textquotedblright} edited by Eric Margolis and Stephen Laurence, print date Spring 2015.
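To make the PLoT idea concrete in code, here is a small illustrative Python sketch (not taken from the chapter; the scenario, names, and probabilities are invented): a concept is a short stochastic program, and graded predictions come from conditioning that program on observations, here by plain rejection sampling.

```python
import random

def sample_world():
    # A tiny generative "concept": each person has a latent strength, and a
    # team's pulling power is the sum of its members' strengths, discounted
    # when a member happens to be lazy on a given match.
    strength = {p: random.gauss(0.0, 1.0) for p in ("alice", "bob", "carol")}
    def pulling(team):
        return sum(strength[p] * (0.5 if random.random() < 0.3 else 1.0)
                   for p in team)
    return strength, pulling

def p_alice_stronger_than_bob(n_samples=20000):
    # Condition on data (alice beat bob in two matches) by rejection sampling,
    # then read off a graded degree of belief rather than a hard rule.
    accepted = hits = 0
    for _ in range(n_samples):
        strength, pulling = sample_world()
        match1 = pulling(["alice"]) > pulling(["bob"])  # each call resamples laziness,
        match2 = pulling(["alice"]) > pulling(["bob"])  # so these are two matches
        if match1 and match2:
            accepted += 1
            hits += strength["alice"] > strength["bob"]
    return hits / max(accepted, 1)

print(round(p_alice_stronger_than_bob(), 2))
```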

}, keywords = {Development of Intelligence}, author = {Noah D. Goodman and Joshua B. Tenenbaum and Tobias Gerstenberg} } @article {866, title = {Core geometry in perspective}, journal = {Developmental Science}, year = {2014}, month = {11/2014}, abstract = {

Research on animals, infants, children, and adults provides evidence that distinct cognitive systems underlie navigation and object recognition. Here we examine whether and how these systems interact when children interpret 2D edge-based perspectival line drawings of scenes and objects. Such drawings serve as symbols early in development, and they preserve scene and object geometry from canonical points of view. Young children show limits when using geometry both in non-symbolic tasks and in symbolic map tasks that present 3D contexts from unusual, unfamiliar points of view. When presented with the familiar viewpoints in perspectival line drawings, however, do children engage more integrated geometric representations? In three experiments, children successfully interpreted line drawings with respect to their depicted scene or object. Nevertheless, children recruited distinct processes when navigating based on the information in these drawings, and these processes depended on the context in which the drawings were presented. These results suggest that children are flexible but limited in using geometric information to form integrated representations of scenes and objects, even when interpreting spatial symbols that are highly familiar and faithful renditions of the visual world.

}, doi = {10.1111/desc.12266}, author = {Moira R Dillon and Elizabeth S Spelke} } @article {692, title = {Corticocortical feedback increases the spatial extent of normalization.}, journal = {Frontiers in Systems Neuroscience}, volume = {8}, year = {2014}, month = {05/30/2014 }, pages = {105}, abstract = {

Normalization has been proposed as a canonical computation operating across different brain regions, sensory modalities, and species. It provides a good phenomenological description of non-linear response properties in primary visual cortex (V1), including the contrast response function and surround suppression. Despite its widespread application throughout the visual system, the underlying neural mechanisms remain largely unknown. We recently observed that corticocortical feedback contributes to surround suppression in V1, raising the possibility that feedback acts through normalization. To test this idea, we characterized area summation and contrast response properties in V1 with and without feedback from V2 and V3 in alert macaques and applied a standard normalization model to the data. Area summation properties were well explained by a form of divisive normalization, which computes the ratio between a neuron{\textquoteright}s driving input and the spatially integrated activity of a "normalization pool." Feedback inactivation reduced surround suppression by shrinking the spatial extent of the normalization pool. This effect was independent of the gain modulation thought to mediate the influence of contrast on area summation, which remained intact during feedback inactivation. Contrast sensitivity within the receptive field center was also unaffected by feedback inactivation, providing further evidence that feedback participates in normalization independent of the circuit mechanisms involved in modulating contrast gain and saturation. These results suggest that corticocortical feedback contributes to surround suppression by increasing the visuotopic extent of normalization and, via this mechanism, feedback can play a critical role in contextual information processing.
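The ratio computation at the heart of this account can be illustrated with a few lines of numpy (a schematic toy, not the fitted model from the paper; all profiles and parameter values are invented): the response to a stimulus of a given radius is the spatially integrated drive divided by activity pooled over a wider normalization region, and shrinking that region, as feedback inactivation is reported to do, reduces surround suppression.

```python
import numpy as np

def gaussian(x, sigma):
    return np.exp(-0.5 * (x / sigma) ** 2)

def response(stim_radius, pool_sigma, drive_sigma=0.5, c50=0.3):
    # Ratio model: integrated excitatory drive / (c50 + integrated normalization pool).
    x = np.linspace(-10.0, 10.0, 2001)
    dx = x[1] - x[0]
    stim = (np.abs(x) <= stim_radius).astype(float)
    drive = np.sum(gaussian(x, drive_sigma) * stim) * dx
    pool = np.sum(gaussian(x, pool_sigma) * stim) * dx
    return drive / (c50 + pool)

radii = np.linspace(0.1, 8.0, 50)
large_pool = np.array([response(r, pool_sigma=3.0) for r in radii])   # feedback intact
small_pool = np.array([response(r, pool_sigma=1.5) for r in radii])   # feedback inactivated

for name, curve in [("feedback intact (large pool)", large_pool),
                    ("feedback inactivated (small pool)", small_pool)]:
    suppression = (curve.max() - curve[-1]) / curve.max()
    print(f"{name}: surround suppression index = {suppression:.2f}")
```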

}, issn = {1662-5137}, doi = {10.3389/fnsys.2014.00105}, url = {http://journal.frontiersin.org/article/10.3389/fnsys.2014.00105/abstract}, author = {Jonathan J. Nassi and Camille Gomez-Laberge and Gabriel Kreiman and Richard T Born} } @inbook {1129, title = {Data Analysis techniques for human microwire recordings: spike detection and sorting, decoding, relation between units and local field potentials}, booktitle = {Single Neuron Studies of the Brain: Probing Cognition}, year = {2014}, chapter = {6}, author = {Rutishauser, U and Moran Cerf and Gabriel Kreiman} } @article {227, title = {A Deep Representation for Invariance And Music Classification}, number = {002}, year = {2014}, month = {03/2014}, abstract = {

Representations in the auditory cortex might be based on mechanisms similar to the visual ventral stream; modules for building invariance to transformations and multiple layers for compositionality and selectivity. In this paper we propose the use of such computational modules for extracting invariant and discriminative audio representations. Building on a theory of invariance in hierarchical architectures, we propose a novel, mid-level representation for acoustical signals, using the empirical distributions of projections on a set of templates and their transformations. Under the assumption that, by construction, this dictionary of templates is composed from similar classes, and samples the orbit of variance-inducing signal transformations (such as shift and scale), the resulting signature is theoretically guaranteed to be unique, invariant to transformations and stable to deformations. Modules of projection and pooling can then constitute layers of deep networks, for learning composite representations. We present the main theoretical and computational aspects of a framework for unsupervised learning of invariant audio representations, empirically evaluated on music genre classification.
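The signature computation described above can be sketched in a few lines of Python (an illustrative toy with invented dimensions, using circular shift as the stand-in transformation rather than the pitch or tempo transformations relevant for audio): project the input onto every transformed version of each stored template and pool the projections into an empirical histogram; the concatenated histograms are invariant to the transformation by construction.

```python
import numpy as np

rng = np.random.default_rng(1)

def orbit(template):
    # The (circular) shift orbit of a template: all of its translations.
    return np.stack([np.roll(template, s) for s in range(len(template))])

def signature(signal, templates, n_bins=20):
    # Project the signal onto every transformed template and pool the
    # projections into an empirical histogram; concatenate per template.
    signal = signal / np.linalg.norm(signal)
    feats = []
    for t in templates:
        proj = orbit(t) @ signal                      # one projection per shift
        hist, _ = np.histogram(proj, bins=n_bins, range=(-1, 1))
        feats.append(hist)
    return np.concatenate(feats)

n = 64
templates = [v / np.linalg.norm(v) for v in rng.normal(size=(3, n))]
x = rng.normal(size=n)

s_original = signature(x, templates)
s_shifted = signature(np.roll(x, 17), templates)      # same signal, translated
print("signature change under translation:", np.abs(s_original - s_shifted).max())
```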

}, keywords = {Audio Representation, Hierarchy, Invariance, Machine Learning, Theories for Intelligence}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @conference {1141, title = {A Deep Representation for Invariance and Music Classification}, booktitle = {ICASSP 2014 - 2014 IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2014}, month = {05/04/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Florence, Italy}, keywords = {acoustic signal processing, signal representation, unsupervised learning}, doi = {10.1109/ICASSP.2014.6854954}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6854954}, author = {Chiyuan Zhang and Georgios Evangelopoulos and Stephen Voinea and Lorenzo Rosasco and Tomaso Poggio} } @article {453, title = {Detect What You Can: Detecting and Representing Objects using Holistic Models and Body Parts.}, number = {015}, year = {2014}, month = {06/2014}, abstract = {

Detecting objects becomes difficult when we need to deal with large shape deformation, occlusion and low resolution. We propose a novel approach to i) handle large deformations and partial occlusions in animals (as examples of highly deformable objects), ii) describe them in terms of body parts, and iii) detect them when their body parts are hard to detect (e.g., animals depicted at low resolution). We represent the holistic object and body parts separately and use a fully connected model to arrange templates for the holistic object and body parts. Our model automatically decouples the holistic object or body parts from the model when they are hard to detect. This enables us to represent a large number of holistic object and body part combinations to better deal with different {\textquotedblleft}detectability{\textquotedblright} patterns caused by deformations, occlusion and/or low resolution.
We apply our method to the six animal categories in the PASCAL VOC dataset and show that our method significantly improves the state of the art (by 4.1\% AP) and provides a richer representation for objects. During training we use annotations for body parts (e.g., head, torso, etc.), making use of a new dataset of fully annotated object parts for PASCAL VOC 2010, which provides a mask for each part.

}, author = {Xianjie Chen and Roozbeh Mottaghi and Xiaobai Liu and Sanja Fidler and Raquel Urtasun and Alan Yuille} } @article {389, title = {The dynamics of invariant object recognition in the human visual system.}, journal = {J Neurophysiol}, volume = {111}, year = {2014}, month = {01/2014}, pages = {91-102}, abstract = {

The human visual system can rapidly recognize objects despite transformations that alter their appearance. The precise timing of when the brain computes neural representations that are invariant to particular transformations, however, has not been mapped in humans. Here we employ magnetoencephalography decoding analysis to measure the dynamics of size- and position-invariant visual information development in the ventral visual stream. With this method we can read out the identity of objects beginning as early as 60 ms. Size- and position-invariant visual information appear around 125 ms and 150 ms, respectively, and both develop in stages, with invariance to smaller transformations arising before invariance to larger transformations. Additionally, the magnetoencephalography sensor activity localizes to neural sources that are in the most posterior occipital regions at the early decoding times and then move temporally as invariant information develops. These results provide previously unknown latencies for key stages of human-invariant object recognition, as well as new and compelling evidence for a feed-forward hierarchical model of invariant object recognition where invariance increases at each successive visual area along the ventral stream.

Corresponding Dataset - The dynamics of invariant object recognition in the human visual system.
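The time-resolved decoding procedure can be sketched with scikit-learn on synthetic data (the authors' analyses used MEG data and the Matlab neural decoding toolbox; everything below, including the number of sensors, the onset time, and the signal strength, is invented for illustration): train and cross-validate a linear classifier on the sensor pattern at each time point and note when accuracy rises above chance.

```python
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)

# Synthetic stand-in for MEG data: trials x sensors x time points, with class
# information appearing only after a simulated "stimulus onset".
n_trials, n_sensors, n_times, onset = 120, 306, 50, 20
y = rng.integers(0, 2, size=n_trials)
X = rng.normal(size=(n_trials, n_sensors, n_times))
X[:, :10, onset:] += y[:, None, None] * 0.5   # weak class signal after onset

clf = make_pipeline(StandardScaler(), SVC(kernel="linear"))
accuracy = np.array([
    cross_val_score(clf, X[:, :, t], y, cv=5).mean() for t in range(n_times)
])
print("first time index decodable above 60% accuracy:", int(np.argmax(accuracy > 0.6)))
```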

}, keywords = {Adolescent, Adult, Evoked Potentials, Visual, Female, Humans, Male, Pattern Recognition, Visual, Reaction Time, visual cortex}, issn = {1522-1598}, doi = {10.1152/jn.00394.2013}, url = {http://jn.physiology.org/content/early/2013/09/27/jn.00394.2013.abstract}, author = {Leyla Isik and Ethan Meyers and JZ. Leibo and Tomaso Poggio} } @article {2288, title = {The dynamics of invariant object recognition in the human visual system.}, year = {2014}, month = {01/2014}, abstract = {

This is the dataset for corresponding Journal Article - The dynamics of invariant object recognition in the human visual system.

The human visual system can rapidly recognize objects despite transformations that alter their appearance. The precise timing of when the brain computes neural representations that are invariant to particular transformations, however, has not been mapped in humans. Here we employ magnetoencephalography decoding analysis to measure the dynamics of size- and position-invariant visual information development in the ventral visual stream. With this method we can read out the identity of objects beginning as early as 60 ms. Size- and position-invariant visual information appear around 125 ms and 150 ms, respectively, and both develop in stages, with invariance to smaller transformations arising before invariance to larger transformations. Additionally, the magnetoencephalography sensor activity localizes to neural sources that are in the most posterior occipital regions at the early decoding times and then move temporally as invariant information develops. These results provide previously unknown latencies for key stages of human-invariant object recognition, as well as new and compelling evidence for a feed-forward hierarchical model of invariant object recognition where invariance increases at each successive visual area along the ventral stream.

Dataset files can be downloaded here - http://dx.doi.org/10.7910/DVN/KRUPXZ

11 subjects{\textquoteright} MEG data from Isik et al., 2014. Data is available in raw .fif format or in Matlab raster format that is compatible with the neural decoding toolbox (readout.info).

For Matlab code to pre-process this MEG data and run the decoding analyses, please visit

https://bitbucket.org/lisik/meg_decoding

}, doi = {http://dx.doi.org/10.7910/DVN/KRUPXZ}, author = {Leyla Isik and Ethan Meyers and JZ. Leibo and Tomaso Poggio} } @article {1307, title = {Dynamics of random neural networks with bistable units.}, journal = {Phys Rev E Stat Nonlin Soft Matter Phys}, volume = {90}, year = {2014}, author = {Stern, M and H Sompolinsky and Abbott, L.F.} } @article {1037, title = {Explaining Monkey Face Patch System as Efficient Analysis-by-Synthesis}, year = {2014}, author = {Ilker Yildirim and Tejas Kulkarni and W. A. Freiwald and Joshua B. Tenenbaum} } @article {940, title = {Exploring the functional organization of the superior temporal sulcus with a broad set of naturalistic stimuli}, year = {2014}, author = {Ben Deen and Nancy Kanwisher and Rebecca Saxe} } @conference {1055, title = {A framework for studying synaptic plasticity with neural spike train data}, booktitle = {Neural Information Processing Systems}, year = {2014}, month = {12/2014}, abstract = {

Learning and memory in the brain are implemented by complex, time-varying changes in neural circuitry. The computational rules according to which synaptic weights change over time are the subject of much research, and are not precisely understood. Until recently, limitations in experimental methods have made it challenging to test hypotheses about synaptic plasticity on a large scale. However, as such data become available and these barriers are lifted, it becomes necessary to develop analysis techniques to validate plasticity models. Here, we present a highly extensible framework for modeling arbitrary synaptic plasticity rules on spike train data in populations of interconnected neurons. We treat synaptic weights as a (potentially nonlinear) dynamical system embedded in a fully-Bayesian generalized linear model (GLM). In addition, we provide an algorithm for inferring synaptic weight trajectories alongside the parameters of the GLM and of the learning rules. Using this method, we perform model comparison of two proposed variants of the well-known spike-timing-dependent plasticity (STDP) rule, where nonlinear effects play a substantial role. On synthetic data generated from the biophysical simulator NEURON, we show that we can recover the weight trajectories, the pattern of connectivity, and the underlying learning rules.
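A minimal generative sketch of the modeling idea, assuming a simple additive STDP rule and a Bernoulli GLM observation model (both invented stand-ins for the learning rules and likelihoods compared in the paper, and showing only the forward model, not the Bayesian inference): the synaptic weight evolves as a dynamical system driven by pre- and postsynaptic spikes, while the spikes themselves are emitted by a GLM whose input gain is the current weight.

```python
import numpy as np

rng = np.random.default_rng(2)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

T, dt = 5000, 1.0                    # time bins (ms)
a_plus, a_minus, tau = 0.02, 0.021, 20.0
w, bias = 0.5, -3.0
pre_rate = 0.05                      # presynaptic spike probability per bin

pre = (rng.random(T) < pre_rate).astype(float)
post = np.zeros(T)
weights = np.zeros(T)
x_pre = x_post = 0.0                 # exponential eligibility traces for STDP

for t in range(T):
    # GLM observation model: postsynaptic spiking depends on w times the input.
    post[t] = float(rng.random() < sigmoid(bias + w * pre[t] * 5.0))
    # STDP dynamics: pre-before-post potentiates, post-before-pre depresses.
    x_pre = x_pre * np.exp(-dt / tau) + pre[t]
    x_post = x_post * np.exp(-dt / tau) + post[t]
    w += a_plus * x_pre * post[t] - a_minus * x_post * pre[t]
    w = float(np.clip(w, 0.0, 5.0))
    weights[t] = w

print("initial weight 0.50, final weight %.2f" % weights[-1])
```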

}, author = {Scott W. Linderman and Christopher Stock and Ryan Adams} } @article {448, title = {The Genesis Story Understanding and Story Telling System A 21st Century Step toward Artificial Intelligence.}, number = {019}, year = {2014}, month = {06/2014}, abstract = {

Story understanding is an important differentiator of human intelligence, perhaps the most important differentiator. The Genesis system was built to model and explore aspects of story understanding using simply expressed, 20-100 sentence stories drawn from sources ranging from fairy tales to Shakespeare{\textquoteright}s plays. I describe Genesis at work as it reflects on its reading in search of concepts, reads stories with controllable allegiances and cultural biases, models personality traits, answers basic questions about why and when, notes concept onsets so as to anticipate trouble, calculates similarity using concepts, models question-driven interpretation, aligns similar stories for analogical reasoning, develops summaries, and tells and persuades using a reader model. I conclude with thoughts on how Genesis would describe people in pictures and video, thus engaging with the CBMM challenge problem.

}, keywords = {Visual Intelligence}, author = {Patrick Henry Winston} } @article {454, title = {Human-Machine CRFs for Identifying Bottlenecks in Holistic Scene Understanding.}, number = {020}, year = {2014}, month = {06/2014}, abstract = {

Recent trends in image understanding have pushed for holistic scene understanding models that jointly reason about various tasks such as object detection, scene recognition, shape analysis, contextual reasoning, and local appearance based classifiers. In this work, we are interested in understanding the roles of these different tasks in improved scene understanding, in particular semantic segmentation, object detection and scene recognition. Towards this goal, we {\textquotedblleft}plug-in{\textquotedblright} human subjects for each of the various components in a state-of-the-art conditional random field model. Comparisons among various hybrid human-machine CRFs give us indications of how much {\textquotedblleft}head room{\textquotedblright} there is to improve scene understanding by focusing research efforts on various individual tasks.

}, author = {Roozbeh Mottaghi and Sanja Fidler and Alan Yuille and Raquel Urtasun and Devi Parikh} } @article {1342, title = {Imitation Preferences of Preverbal Infants.}, year = {2014}, month = {08/2014}, publisher = {Poster presentation at the Center for Brain Minds and Machines Summer Conference, Cambridge, MA}, author = {Heather L Kosakowski and Lindsey J Powell and Elizabeth S Spelke} } @article {438, title = {The Invariance Hypothesis Implies Domain-Specific Regions in Visual Cortex}, number = {004}, year = {2014}, month = {04/2014}, abstract = {

Is visual cortex made up of general-purpose information processing machinery, or does it consist of a collection of specialized modules? If prior knowledge, acquired from learning a set of objects, is only transferable to new objects that share properties with the old, then the recognition system{\textquoteright}s optimal organization must be one containing specialized modules for different object classes. Our analysis starts from a premise we call the invariance hypothesis: that the computational goal of the ventral stream is to compute an invariant-to-transformations and discriminative signature for recognition. The key condition enabling approximate transfer of invariance without sacrificing discriminability turns out to be that the learned and novel objects transform similarly. This implies that the optimal recognition system must contain subsystems trained only with data from similarly-transforming objects and suggests a novel interpretation of domain-specific regions like the fusiform face area (FFA). Furthermore, we can define an index of transformation-compatibility, computable from videos, that can be combined with information about the statistics of natural vision to yield predictions for which object categories ought to have domain-specific regions. The result is a unifying account linking the large literature on view-based recognition with the wealth of experimental evidence concerning domain-specific regions.

}, keywords = {Neuroscience, Theories for Intelligence}, doi = {10.1101/004473}, url = {http://biorxiv.org/lookup/doi/10.1101/004473}, author = {JZ. Leibo and Qianli Liao and F. Anselmi and Tomaso Poggio} } @article {451, title = {Learning An Invariant Speech Representation}, number = {022}, year = {2014}, month = {06/2014}, abstract = {

Recognition of speech, and in particular the ability to generalize and learn from small sets of labelled examples like humans do, depends on an appropriate representation of the acoustic input. We formulate the problem of finding robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain and empirically evaluate its validity for voiced speech sound classification. Our version of the theory requires the memory-based, unsupervised storage of acoustic templates {\textemdash} such as specific phones or words {\textemdash} together with all the transformations of each that normally occur. A quasi-invariant representation for a speech segment can be obtained by projecting it to each template orbit, i.e., the set of transformed signals, and computing the associated one-dimensional empirical probability distributions. The computations can be performed by modules of filtering and pooling, and extended to hierarchical architectures. In this paper, we apply a single-layer, multicomponent representation for phonemes and demonstrate improved accuracy and decreased sample complexity for vowel classification compared to standard spectral, cepstral and perceptual features.

}, keywords = {Theories for Intelligence}, author = {Georgios Evangelopoulos and Stephen Voinea and Chiyuan Zhang and Lorenzo Rosasco and Tomaso Poggio} } @conference {222, title = {Learning invariant representations and applications to face verification}, booktitle = {NIPS 2013}, year = {2014}, month = {02/2014}, publisher = {Advances in Neural Information Processing Systems 26}, organization = {Advances in Neural Information Processing Systems 26}, address = {Lake Tahoe, Nevada}, abstract = {

One approach to computer object recognition and modeling the brain{\textquoteright}s ventral stream involves unsupervised learning of representations that are invariant to common transformations. However, applications of these ideas have usually been limited to 2D affine transformations, e.g., translation and scaling, since they are easiest to solve via convolution. In accord with a recent theory of transformation-invariance [1], we propose a model that, while capturing other common convolutional networks as special cases, can also be used with arbitrary identity-preserving transformations. The model{\textquoteright}s wiring can be learned from videos of transforming objects{\textemdash}or any other grouping of images into sets by their depicted object. Through a series of successively more complex empirical tests, we study the invariance/discriminability properties of this model with respect to different transformations. First, we empirically confirm theoretical predictions (from [1]) for the case of 2D affine transformations. Next, we apply the model to non-affine transformations; as expected, it performs well on face verification tasks requiring invariance to the relatively smooth transformations of 3D rotation-in-depth and changes in illumination direction. Surprisingly, it can also tolerate clutter {\textquotedblleft}transformations{\textquotedblright} which map an image of a face on one background to an image of the same face on a different background. Motivated by these empirical findings, we tested the same model on face verification benchmark tasks from the computer vision literature: Labeled Faces in the Wild, PubFig [2, 3, 4] and a new dataset we gathered{\textemdash}achieving strong performance in these highly unconstrained cases as well.

}, keywords = {Computer vision}, url = {http://nips.cc/Conferences/2013/Program/event.php?ID=4074}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @conference {216, title = {Machine Learning Based Automated Fault Detection in Seismic Traces}, booktitle = {EAGE Conference and Exhibition 2014}, year = {2014}, month = {06/2014}, address = {The Netherlands}, abstract = {

Introduction:

The initial stages of velocity model building (VMB) start from smooth models that capture geological assumptions about the subsurface region under analysis. Acceptable velocity models result from successive iterations of human intervention (the interpreter) and seismic data processing within complex workflows. The interpreters ensure that any additions or corrections made by seismic processing are compliant with geological and geophysical knowledge. The information that seismic processing adds to the model consists of structural elements; faults are among the most relevant of these events since they can signal reservoir boundaries or hydrocarbon traps. Faults are excluded from the initial models due to their local scale. Bringing faults into the model at early stages can help to steer the VMB process.

This work introduces a tool whose purpose is to assist the interpreters during the initial stages of the VMB, when no seismic data have been migrated. Our novel method is based on machine learning techniques and can automatically identify and localize faults from unmigrated seismic data. Comprehensive research has targeted the fault localization problem, but most results are obtained using processed seismic data or images as input (Admasu and Toennies (2004); Tingdahl et al. (2001); Cohen et al. (2006); Hale (2013), etc.). Our approach suggests an additional tool that can be used to speed up the VMB process.

Fully automated VMB has not been achieved because the human factor is difficult to formalize in a way that can be systematically applied. Nonetheless, if our framework is extended to other seismic events or attributes, it might become a powerful tool to alleviate interpreters{\textquoteright} work.

}, url = {http://cbcl.mit.edu/publications/eage14.pdf}, author = {Chiyuan Zhang and Charlie Frogner and Mauricio Araya-Polo and Detlef Hohl} } @inbook {1132, title = {Neural correlates of consciousness: perception and volition}, booktitle = {Cognitive Neuroscience}, volume = {V}, year = {2014}, author = {Gabriel Kreiman} } @article {1134, title = {Neural Dynamics Underlying Target Detection in the Human Brain}, journal = {Journal of Neuroscience}, volume = {34}, year = {2014}, chapter = {3042}, author = {Bansal, A and Radhika Madhavan and Agam, Y and Golby, A and Joseph Madsen and Gabriel Kreiman} } @article {225, title = {Neural Mechanisms of Object-Based Attention}, journal = {Science}, volume = {344}, year = {2014}, month = {04/2014}, pages = {424 - 427}, abstract = {

How we attend to objects and their features that cannot be separated by location is not understood. We presented two temporally and spatially overlapping streams of objects, faces versus houses, and used magnetoencephalography and functional magnetic resonance imaging to separate neuronal responses to attended and unattended objects. Attention to faces versus houses enhanced the sensory responses in the fusiform face area (FFA) and parahippocampal place area (PPA), respectively. The increases in sensory responses were accompanied by induced gamma synchrony between the inferior frontal junction, IFJ, and either FFA or PPA, depending on which object was attended. The IFJ appeared to be the driver of the synchrony, as gamma phases were advanced by 20 ms in IFJ compared to FFA or PPA. Thus, the IFJ may direct the flow of visual processing during object-based attention, at least in part through coupled oscillations with specialized areas such as FFA and PPA.

}, keywords = {Face recognition, Theories for Intelligence, Visual Intelligence}, issn = {0036-8075}, doi = {10.1126/science.1247003}, url = {http://www.sciencemag.org/cgi/doi/10.1126/science.1247003}, author = {Baldauf, D. and R. Desimone} } @article {450, title = {Neural tuning size is a key factor underlying holistic face processing.}, number = {021}, year = {2014}, month = {06/2014}, abstract = {

Faces are a class of visual stimuli with unique significance, for a variety of reasons. They are ubiquitous throughout the course of a person{\textquoteright}s life, and face recognition is crucial for daily social interaction. Faces are also unlike any other stimulus class in terms of certain physical stimulus characteristics. Furthermore, faces have been empirically found to elicit certain characteristic behavioral phenomena, which are widely held to be evidence of {\textquotedblleft}holistic{\textquotedblright} processing of faces. However, little is known about the neural mechanisms underlying such holistic face processing. In other words, for the processing of faces by the primate visual system, the input and output characteristics are relatively well known, but the internal neural computations are not. The main aim of this work is to further the fundamental understanding of what causes the visual processing of faces to be different from that of objects. In this computational modeling work, we show that a single factor {\textendash} {\textquotedblleft}neural tuning size{\textquotedblright} {\textendash} is able to account for three key phenomena that are characteristic of face processing, namely the Composite Face Effect (CFE), Face Inversion Effect (FIE) and Whole-Part Effect (WPE). Our computational proof-of-principle provides specific neural tuning properties that correspond to the poorly understood notion of holistic face processing, and connects these neural properties to psychophysical behavior. Overall, our work provides a unified and parsimonious theoretical account for the disparate empirical data on face-specific processing, deepening the fundamental understanding of face processing.

}, keywords = {Theories for Intelligence}, author = {Cheston Tan and Tomaso Poggio} } @inbook {1131, title = {The next ten years and beyond}, booktitle = {Single neuron studies of the human brain. Probing cognition}, year = {2014}, chapter = {19}, author = {Gabriel Kreiman and Rutishauser, U and Moran Cerf and Itzhak Fried} } @article {458, title = {A Nonparametric Bayesian Approach to Uncovering Rat Hippocampal Population Codes During Spatial Navigation.}, number = {027}, year = {2014}, month = {11/2014}, abstract = {

Rodent hippocampal population codes represent important spatial information about the environment during navigation. Several computational methods have been developed to uncover the neural representation of spatial topology embedded in rodent hippocampal ensemble spike activity. Here we extend our previous work and propose a nonparametric Bayesian approach to infer rat hippocampal population codes during spatial navigation. To tackle the model selection problem, we leverage a nonparametric Bayesian model. Specifically, to analyze rat hippocampal ensemble spiking activity, we apply a hierarchical Dirichlet process-hidden Markov model (HDP-HMM) using two Bayesian inference methods, one based on Markov chain Monte Carlo (MCMC) and the other based on variational Bayes (VB). We demonstrate the effectiveness of our Bayesian approaches on recordings from a freely-behaving rat navigating in an open field environment. We find that MCMC-based inference with Hamiltonian Monte Carlo (HMC) hyperparameter sampling is flexible and efficient, and outperforms VB and MCMC approaches with hyperparameters set by empirical Bayes.

}, author = {Scott W. Linderman and Matthew J. Johnson and Matthew A. Wilson and Zhe Chen} } @article {441, title = {A normalization model of visual search predicts single trial human fixations in an object search task.}, number = {008}, year = {2014}, month = {04/2014}, abstract = {

When searching for an object in a scene, how does the brain decide where to look next? Theories of visual search suggest the existence of a global attentional map, computed by integrating bottom-up visual information with top-down, target-specific signals. Where, when and how this integration is performed remains unclear. Here we describe a simple mechanistic model of visual search that is consistent with neurophysiological and neuroanatomical constraints, can localize target objects in complex scenes, and predicts single-trial human behavior in a search task among complex objects. This model posits that target-specific modulation is applied at every point of a retinotopic area selective for complex visual features and implements local normalization through divisive inhibition. The combination of multiplicative modulation and divisive normalization creates an attentional map in which aggregate activity at any location tracks the correlation between input and target features, with relative and controllable independence from bottom-up saliency. We first show that this model can localize objects in both composite images and natural scenes and demonstrate the importance of normalization for successful search. We next show that this model can predict human fixations on single trials, including error and target-absent trials. We argue that this simple model captures non-trivial properties of the attentional system that guides visual search in humans.
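The proposed combination of multiplicative target modulation and divisive normalization can be illustrated with a small numpy toy (invented feature maps, neighborhood size, and constants, not the model from the paper): responses correlated with the target survive normalization, so the resulting attention map peaks at the target location.

```python
import numpy as np

rng = np.random.default_rng(3)

H, W, F = 32, 32, 16                       # retinotopic map size, feature channels
features = rng.normal(size=(H, W, F))      # bottom-up feature responses
target = rng.normal(size=F)                # top-down target feature vector

obj_r, obj_c = 21, 9                       # hide a target-like object here
features[obj_r, obj_c] = 3.0 * target

# Multiplicative target modulation, rectification, then divisive normalization
# by activity pooled over a local neighborhood.
drive = np.maximum((features * target).sum(axis=2), 0.0)
norm_pool = np.zeros_like(drive)
k = 3
for r in range(H):
    for c in range(W):
        norm_pool[r, c] = drive[max(r - k, 0):r + k + 1, max(c - k, 0):c + k + 1].mean()
attention_map = drive / (0.1 + norm_pool)

print("predicted fixation:", np.unravel_index(attention_map.argmax(), attention_map.shape))
print("true object location:", (obj_r, obj_c))
```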

}, keywords = {Circuits for Intelligence, Pattern recognition}, author = {Thomas Miconi and Laura Groomes and Gabriel Kreiman} } @article {444, title = {Parsing Semantic Parts of Cars Using Graphical Models and Segment Appearance Consistency.}, number = {018}, year = {2014}, month = {06/2014}, abstract = {

This paper addresses the problem of semantic part parsing (segmentation) of cars, i.e., assigning every pixel within the car to one of the parts (e.g., body, window, lights, license plates and wheels). We formulate this as a landmark identification problem, where a set of landmarks specifies the boundaries of the parts. A novel mixture of graphical models is proposed, which dynamically couples the landmarks to a hierarchy of segments. When modeling pairwise relation between landmarks, this coupling enables our model to exploit the local image contents in addition to spatial deformation, an aspect that most existing graphical models ignore. In particular, our model enforces appearance consistency between segments within the same part. Parsing the car, including finding the optimal coupling between landmarks and segments in the hierarchy, is performed by dynamic programming. We evaluate our method on a subset of PASCAL VOC 2010 car images and on the car subset of 3D Object Category dataset (CAR3D). We show good results and, in particular, quantify the effectiveness of using the segment appearance consistency in terms of accuracy of part localization and segmentation.

}, author = {Wenhao Lu and Xiaochen Lian and Alan Yuille} } @article {386, title = {People, objects and interactions in movies}, year = {2014}, month = {06/2014}, abstract = {

This database contains annotations for commercial movies including information about presence/absence of specific people, their viewpoints, their motion, their emotions, presence/absence of specific objects and their motion.

See the related publication - https://cbmm.mit.edu/publications/predicting-episodic-memory-formation-movie-events

See the related code - https://cbmm.mit.edu/publications/predicting-episodic-memory-formation-movie-events-code

Click HERE to access the website to download the database.

}, keywords = {Computer vision}, author = {Gabriel Kreiman} } @conference {220, title = {Phone Classification by a Hierarchy of Invariant Representation Layers}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

We propose a multi-layer feature extraction framework for speech, capable of providing invariant representations. A set of templates is generated by sampling the result of applying smooth, identity-preserving transformations (such as vocal tract length and tempo variations) to arbitrarily-selected speech signals. Templates are then stored as the weights of {\textquotedblleft}neurons{\textquotedblright}. We use a cascade of such computational modules to factor out different types of transformation variability in a hierarchy, and show that it improves phone classification over baseline features. In addition, we describe empirical comparisons of a) different transformations which may be responsible for the variability in speech signals and of b) different ways of assembling template sets for training. The proposed layered system is an effort towards explaining the performance of recent deep learning networks and the principles by which the human auditory cortex might reduce the sample complexity of learning in speech recognition. Our theory and experiments suggest that invariant representations are crucial in learning from complex, real-world data like natural speech. Our model is built on basic computational primitives of cortical neurons, thus making an argument about how representations might be learned in the human auditory cortex.

}, keywords = {Hierarchy, Invariance, Neural Networks, Speech Representation}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2346.html}, author = {Chiyuan Zhang and Stephen Voinea and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @article {390, title = {Predicting Saliency Beyond Pixels}, year = {2014}, month = {01/2014}, abstract = {

A large body of previous models for predicting where people look in natural scenes has focused on pixel-level image attributes. To bridge the semantic gap between the predictive power of computational saliency models and human behavior, we propose a new saliency architecture that incorporates information at three layers: pixel-level image attributes, object-level attributes, and semantic-level attributes. Object- and semantic-level information is frequently ignored, or only a few sample object categories are discussed, where scaling to a large number of object categories is neither feasible nor neurally plausible. To address this problem, this work constructs a principled vocabulary of basic attributes to describe object- and semantic-level information, thus avoiding the restriction to a limited number of object categories. We build a new dataset of 700 images with eye-tracking data of 15 viewers and annotation data of 5551 segmented objects with fine contours and 12 semantic attributes. Experimental results demonstrate the importance of the object- and semantic-level information in the prediction of visual attention.

}, url = {http://www.ece.nus.edu.sg/stfpage/eleqiz/predicting.html}, author = {Juan Xu and Ming Jiang and Shuo Wang and Mohan Kankanhalli and Qi Zhao} } @conference {1298, title = {Preschoolers expect others to learn rationally from evidence}, booktitle = {Annual Conference of the Cognitive Science Society}, year = {2014}, abstract = {

Even infants expect agents to act rationally in pursuit of their goals. However, little research has looked at whether young children expect other agents to learn rationally. In the current study, we investigated 4.5- to 6-year-olds{\textquoteright} reasoning about another agent{\textquoteright}s beliefs after the agent observed a sample drawn randomly or selectively from a population. We found that those children who could correctly track both the true state of the world and the other agent{\textquoteright}s initial beliefs expected the other agent to learn rationally from the data. Critically, this inference depended upon but could not be reduced to either the child{\textquoteright}s own understanding of the world, or the child{\textquoteright}s own inferences from the sampling process, suggesting that the ability to integrate these component processes underlies a developing understanding of the way in which evidence informs others{\textquoteright} beliefs.

}, keywords = {learning, rational action, theory of mind}, author = {Phyllis Yan and Rachel Magid and Laura Schulz} } @inbook {918, title = {Querying Factorized Probabilistic Triple Databases}, booktitle = {The Semantic Web {\textendash} ISWC 2014}, series = {Lecture Notes in Computer Science}, volume = {8797}, year = {2014}, pages = {114-129}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, isbn = {978-3-319-11914-4}, doi = {10.1007/978-3-319-11915-1_8}, url = {http://dx.doi.org/10.1007/978-3-319-11915-1_8}, author = {Krompa{\ss}, Denis and Maximilian Nickel and Volker Tresp} } @article {439, title = {Reconstructing Native Language Typology from Foreign Language Usage.}, number = {007}, year = {2014}, month = {04/2014}, abstract = {

Linguists and psychologists have long been studying cross-linguistic transfer, the influence of native language properties on linguistic performance in a foreign language. In this work we provide empirical evidence for this process in the form of a strong correlation between language similarities derived from structural features in English as a Second Language (ESL) texts and equivalent similarities obtained directly from the typological features of the native languages. We leverage this finding to recover native language typological similarity structure directly from ESL text, and perform prediction of typological features in an unsupervised fashion with respect to the target languages. Our method achieves 72.2\% accuracy on the typology prediction task, a result that is highly competitive with equivalent methods that rely on typological resources.
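A schematic numpy sketch of the analysis pipeline described above, with randomly generated stand-ins for the ESL-derived structural features and the typological feature vectors (so the printed numbers are meaningless except as a demonstration of the mechanics): compute pairwise language similarities from each source, correlate the two similarity structures, and predict a held-out language's typology from its nearest ESL neighbor.

```python
import numpy as np

rng = np.random.default_rng(4)
languages = ["French", "Spanish", "Russian", "Japanese", "Korean"]
n = len(languages)

# Illustrative stand-ins: structural feature vectors estimated from ESL text
# and binary typological feature vectors (e.g., as catalogued in WALS).
esl_features = rng.normal(size=(n, 40))
typology = rng.integers(0, 2, size=(n, 25)).astype(float)

def cosine_sim(M):
    X = M / np.linalg.norm(M, axis=1, keepdims=True)
    return X @ X.T

esl_sim, typ_sim = cosine_sim(esl_features), cosine_sim(typology)

# Compare the two similarity structures over all language pairs.
iu = np.triu_indices(n, k=1)
corr = np.corrcoef(esl_sim[iu], typ_sim[iu])[0, 1]
print(f"correlation between ESL-based and typological similarities: {corr:.2f}")

# Predict a held-out language's typological features from its ESL-nearest neighbor.
held_out = 0
neighbors = np.argsort(-esl_sim[held_out])
nearest = next(i for i in neighbors if i != held_out)
accuracy = (typology[nearest] == typology[held_out]).mean()
print(f"nearest-neighbor typology prediction accuracy for {languages[held_out]}: {accuracy:.2f}")
```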

}, keywords = {language, linguistics, Visual Intelligence}, author = {Yevgeni Berzak and Roi Reichart and Boris Katz} } @inbook {920, title = {Reducing the Rank in Relational Factorization Models by Including Observable Patterns}, booktitle = {Advances in Neural Information Processing Systems 27}, year = {2014}, pages = {1179{\textendash}1187}, publisher = {Curran Associates, Inc.}, organization = {Curran Associates, Inc.}, abstract = {
Tensor factorization has become a popular method for learning from multi-relational data. In this context, the rank of the factorization is an important parameter that determines runtime as well as generalization ability. To identify conditions under which factorization is an efficient approach for learning from relational data, we derive upper and lower bounds on the rank required to recover adjacency tensors. Based on our findings, we propose a novel additive tensor factorization model to learn from latent and observable patterns on multi-relational data and present a scalable algorithm for computing the factorization. We show experimentally both that the proposed additive model does improve the predictive performance over pure latent variable methods and that it also reduces the required rank {\textemdash} and therefore runtime and memory complexity {\textemdash} significantly.
}, url = {http://papers.nips.cc/paper/5448-reducing-the-rank-in-relational-factorization-models-by-including-observable-patterns.pdf}, author = {Maximilian Nickel and Jiang, Xueyan and Volker Tresp} } @article {457, title = {Representation Learning in Sensory Cortex: a theory.}, number = {026}, year = {2014}, month = {11/2014}, abstract = {

We review and apply a computational theory of the feedforward path of the ventral stream in visual cortex based on the hypothesis that its main function is the encoding of invariant representations of images. A key justification of the theory is provided by a theorem linking invariant representations to small sample complexity for recognition {\textendash} that is, invariant representations allow learning from very few labeled examples. The theory characterizes how an algorithm that can be implemented by a set of {\textquotedblleft}simple{\textquotedblright} and {\textquotedblleft}complex{\textquotedblright} cells {\textendash} an {\textquotedblleft}HW module{\textquotedblright} {\textendash} provides invariant and selective representations. The invariance can be learned in an unsupervised way from observed transformations. Theorems show that invariance implies several properties of the ventral stream organization, including the eccentricity-dependent lattice of units in the retina and in V1, and the tuning of its neurons. The theory requires two stages of processing: the first, consisting of retinotopic visual areas such as V1, V2 and V4 with generic neuronal tuning, leads to representations that are invariant to translation and scaling; the second, consisting of modules in IT, with class- and object-specific tuning, provides a representation for recognition with approximate invariance to class-specific transformations, such as pose (of a body, of a face) and expression. In the theory, the ventral stream{\textquoteright}s main function is the unsupervised learning of {\textquotedblleft}good{\textquotedblright} representations that reduce the sample complexity of the final supervised learning stage.

}, author = {F. Anselmi and Tomaso Poggio} } @article {2058, title = {Is Research in Intelligence an Existential Risk?}, year = {2014}, month = {12/2014}, abstract = {

Recent months have seen an increasingly public debate taking form around the risks of AI (Artificial Intelligence). A letter signed by Nobel laureates and other physicists defined AI as the top existential risk to mankind. More recently, Tesla CEO Elon Musk has been quoted saying that it is {\textquotedblleft}potentially more dangerous than nukes.{\textquotedblright} Physicist Stephen Hawking told the BBC that {\textquotedblleft}the development of full artificial intelligence could spell the end of the human race{\textquotedblright}. And of course recent films such as Her and Transcendence have reinforced the message. Thoughtful comments by experts in the field such as Rod Brooks, Oren Etzioni and others have done little to settle the debate.

As the Director of a new multi-institution, NSF-funded and MIT-based Science and Technology Center {\textemdash} called the Center for Brains, Minds and Machines (CBMM) {\textemdash} I am arguing here, on behalf of my collaborators and many colleagues, that the terms of the debate should be fundamentally rephrased. Our vision of the Center{\textquoteright}s research integrates cognitive science, neuroscience, computer science, and artificial intelligence. Our belief is that understanding intelligence and replicating it in machines goes hand in hand with understanding how the brain and the mind perform intelligent computations. The convergence of recent progress in technology, mathematics, and neuroscience has created a new opportunity for synergy across fields. The dream of understanding intelligence is an old one. Yet, as the debate around AI shows, now is an exciting time to pursue this vision. Our mission at CBMM is thus to establish an emerging field, the Science and Engineering of Intelligence. This integrated effort should ultimately make fundamental progress with great value to science, technology, and society. We believe that we must push ahead with research, not pull back.

}, author = {Tomaso Poggio} } @article {446, title = {Robust Estimation of 3D Human Poses from a Single Image.}, number = {013}, year = {2014}, month = {06/2014}, abstract = {

Human pose estimation is a key step to action recognition. We propose a method of estimating 3D human poses from a single image, which works in conjunction with an existing 2D pose/joint detector. 3D pose estimation is challenging because multiple 3D poses may correspond to the same 2D pose after projection due to the lack of depth information. Moreover, current 2D pose estimators are usually inaccurate which may cause errors in the 3D estimation. We address the challenges in three ways: (i) We represent a 3D pose as a linear combination of a sparse set of bases learned from 3D human skeletons. (ii) We enforce limb length constraints to eliminate anthropomorphically implausible skeletons. (iii) We estimate a 3D pose by minimizing the L1 -norm error between the projection of the 3D pose and the corresponding 2D detection. The L1-norm loss term is robust to inaccurate 2D joint estimations. We use the alternating direction method (ADM) to solve the optimization problem efficiently. Our approach outperforms the state-of-the-arts on three benchmark datasets.
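Read as an optimization problem, the three ingredients listed above combine roughly as follows (my paraphrase of the abstract, with illustrative notation rather than the paper's):

\[
\min_{\omega}\;\Bigl\| \, x_{2D} \;-\; \Pi \textstyle\sum_i \omega_i B_i \, \Bigr\|_1 \;+\; \lambda \|\omega\|_1
\quad \text{subject to limb-length constraints on } \textstyle\sum_i \omega_i B_i ,
\]

where $x_{2D}$ are the detected 2D joints, $\Pi$ the camera projection, $B_i$ the 3D basis poses learned from human skeletons with sparse coefficients $\omega_i$, and the constraint set rules out anthropomorphically implausible limb lengths; the $\ell_1$ data term gives robustness to inaccurate 2D detections, and the abstract states the problem is solved efficiently with the alternating direction method (ADM).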

}, author = {Chunyu Wang and Yizhou Wang and Zhouchen Lin and Alan Yuille and Wen Gao} } @article {456, title = {A role for recurrent processing in object completion: neurophysiological, psychophysical and computational evidence.}, number = {009}, year = {2014}, month = {04/2014}, abstract = {

Recognition of objects from partial information presents a significant challenge for theories of vision because it requires spatial integration and extrapolation from prior knowledge. We combined neurophysiological recordings in human cortex with psychophysical measurements and computational modeling to investigate the mechanisms involved in object completion. We recorded intracranial field potentials from 1,699 electrodes in 18 epilepsy patients to measure the timing and selectivity of responses along human visual cortex to whole and partial objects. Responses along the ventral visual stream remained selective despite showing only 9{\textendash}25\% of the object. However, these visually selective signals emerged ~100 ms later for partial versus whole objects. The processing delays were particularly pronounced in higher visual areas within the ventral stream, suggesting the involvement of additional recurrent processing. In separate psychophysics experiments, disrupting this recurrent computation with a backward mask at ~75 ms significantly impaired recognition of partial, but not whole, objects. Additionally, computational modeling shows that the performance of a purely bottom-up architecture is impaired by heavy occlusion and that this effect can be partially rescued via the incorporation of top-down connections. These results provide spatiotemporal constraints on theories of object recognition that involve recurrent processing to recognize objects from partial information.

}, author = {Hanlin Tang and Buia, Calin and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {445, title = {The Secrets of Salient Object Segmentation.}, number = {014}, year = {2014}, month = {06/2014}, abstract = {

In this paper we provide an extensive evaluation of fixation prediction and salient object segmentation algorithms, as well as statistics of major datasets. Our analysis identifies serious design flaws of existing salient object benchmarks, called the dataset design bias, by over-emphasising the stereotypical concepts of saliency. The dataset design bias not only creates the discomforting disconnection between fixations and salient object segmentation, but also misleads algorithm design. Based on our analysis, we propose a new high-quality dataset that offers both fixation and salient object segmentation ground truth. With fixations and salient objects being presented simultaneously, we are able to bridge the gap between fixations and salient objects, and propose a novel method for salient object segmentation. Finally, we report significant benchmark progress on three existing datasets for segmenting salient objects.

}, author = {Yin Li and Christof Koch and James M. Rehg and Alan Yuille} } @article {1367, title = {Seeing is Worse than Believing: Reading People{\textquoteright}s Minds Better than Computer-Vision Methods Recognize Actions}, number = {012}, year = {2014}, month = {09/2014}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.
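For readers unfamiliar with this style of decoding analysis, the sketch below shows the generic recipe (linear SVM plus cross-validation) on placeholder data; the feature matrix, labels, and parameters are stand-ins, not the study's data or pipeline.

```python
# Generic within-subject decoding sketch: linear SVM with cross-validation.
# X and y are synthetic placeholders for voxel features and action labels.
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.standard_normal((120, 500))   # trials x voxels (placeholder features)
y = rng.integers(0, 6, size=120)      # one-of-six action labels (placeholder)

clf = SVC(kernel="linear", C=1.0)     # linear SVM, standard for MVPA-style decoding
scores = cross_val_score(clf, X, y, cv=5)
print("mean cross-validated accuracy:", scores.mean())  # ~chance on random labels
```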

}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @inbook {1090, title = {Seeing is worse than believing: Reading people{\textquoteright}s minds better than computer-vision methods recognize actions}, booktitle = {Computer Vision {\textendash} ECCV 2014, Lecture Notes in Computer Science}, series = {13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V}, volume = {8693}, year = {2014}, pages = {612{\textendash}627}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Zurich, Switzerland}, abstract = {

We had human subjects perform a one-out-of-six class action recognition task from video stimuli while undergoing functional magnetic resonance imaging (fMRI). Support-vector machines (SVMs) were trained on the recovered brain scans to classify actions observed during imaging, yielding average classification accuracy of 69.73\% when tested on scans from the same subject and of 34.80\% when tested on scans from different subjects. An apples-to-apples comparison was performed with all publicly available software that implements state-of-the-art action recognition on the same video corpus with the same cross-validation regimen and same partitioning into training and test sets, yielding classification accuracies between 31.25\% and 52.34\%. This indicates that one can read people{\textquoteright}s minds better than state-of-the-art computer-vision methods can perform action recognition.

}, doi = {10.1007/978-3-319-10602-1_40}, author = {Andrei Barbu and Daniel Barrett and Wei Chen and N. Siddharth and Caiming Xiong and Jason J. Corso and Christiane D. Fellbaum and Catherine Hanson and Stephen Jos{\'e} Hanson and Sebastien Helie and Evguenia Malaia and Barak A. Pearlmutter and Jeffrey Mark Siskind and Thomas Michael Talavage and Ronnie B. Wilbur} } @article {442, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video.}, number = {006}, year = {2014}, month = {05/2014}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, thereby providing a medium not only for top-down and bottom-up integration, but also for multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole sentential descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity videos: sentence-guided focus of attention, generation of sentential descriptions of video, and query-based video search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @conference {1089, title = {Seeing What You{\textquoteright}re Told: Sentence-Guided Activity Recognition In Video}, booktitle = {CVPR}, year = {2014}, month = {07/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Columbus, Ohio}, abstract = {

We present a system that demonstrates how the compositional structure of events, in concert with the compositional structure of language, can interplay with the underlying focusing mechanisms in video action recognition, providing a medium for top-down and bottom-up integration as well as multi-modal integration between vision and language. We show how the roles played by participants (nouns), their characteristics (adjectives), the actions performed (verbs), the manner of such actions (adverbs), and changing spatial relations between participants (prepositions), in the form of whole-sentence descriptions mediated by a grammar, guide the activity-recognition process. Further, the utility and expressiveness of our framework are demonstrated by performing three separate tasks in the domain of multi-activity video: sentence-guided focus of attention, generation of sentential description, and query-based search, simply by leveraging the framework in different manners.

}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {1094, title = {Seeing what you{\textquoteright}re told, sentence guided activity recognition in video}, year = {2014}, publisher = {IEEE}, author = {N. Siddharth and Andrei Barbu and Jeffrey Mark Siskind} } @article {440, title = {Sensitivity to Timing and Order in Human Visual Cortex.}, number = {005}, year = {2014}, month = {04/2014}, abstract = {

Visual recognition takes a small fraction of a second and relies on the cascade of signals along the ventral visual stream. Given the rapid path through multiple processing steps between photoreceptors and higher visual areas, information must progress from stage to stage very quickly. This rapid progression of information suggests that fine temporal details of the neural response may be important to how the brain encodes visual signals. We investigated how changes in the relative timing of incoming visual stimulation affect the representation of object information by recording intracranial field potentials along the human ventral visual stream while subjects recognized objects whose parts were presented with varying asynchrony. Visual responses along the ventral stream were sensitive to timing differences between parts as small as 17 ms. In particular, there was a strong dependency on the temporal order of stimulus presentation, even at short asynchronies. This sensitivity to the order of stimulus presentation provides evidence that the brain may use differences in relative timing as a means of representing information.

}, keywords = {Circuits for Intelligence, Pattern recognition, Visual}, author = {Jedediah Singer and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {1910, title = {Sharp emergence of feature-selective sustained activity along the dorsal visual pathway}, journal = {Nature Neuroscience}, volume = {7}, year = {2014}, month = {09/2014}, chapter = {1255}, abstract = {

Sustained activity encoding visual working memory representations has been observed in several cortical areas of primates. Where along the visual pathways this activity emerges remains unknown. Here we show in macaques that sustained spiking activity encoding memorized visual motion directions is absent in direction-selective neurons in the early visual middle temporal area (MT). However, it is robustly present immediately downstream, in the multimodal association area medial superior temporal (MST), as well as in the lateral prefrontal cortex (LPFC). This sharp emergence of sustained activity along the dorsal visual pathway suggests a functional boundary between early visual areas, which encode sensory inputs, and downstream association areas, which additionally encode mnemonic representations. Moreover, local field potential oscillations in MT encoded the memorized directions and, in the low frequencies, were phase-coherent with LPFC spikes. This suggests that LPFC sustained activity modulates synaptic activity in MT, a putative top-down mechanism by which memory signals influence stimulus processing in early visual cortex.

}, doi = {doi:10.1038/nn.3785}, url = {http://www.nature.com/neuro/journal/v17/n9/abs/nn.3785.html}, author = {Diego Mendoza-Halliday and Santiago Torres and Julio Martinez-Trujillo} } @article {1156, title = {Short temporal asynchrony disrupts visual object recognition.}, journal = {J Vis}, volume = {14}, year = {2014}, month = {2014}, pages = {7}, abstract = {

Humans can recognize objects and scenes in a small fraction of a second. The cascade of signals underlying rapid recognition might be disrupted by temporally jittering different parts of complex objects. Here we investigated the time course over which shape information can be integrated to allow for recognition of complex objects. We presented fragments of object images in an asynchronous fashion and behaviorally evaluated categorization performance. We observed that visual recognition was significantly disrupted by asynchronies of approximately 30 ms, suggesting that spatiotemporal integration begins to break down with even small deviations from simultaneity. However, moderate temporal asynchrony did not completely obliterate recognition; in fact, integration of visual shape information persisted even with an asynchrony of 100 ms. We describe the data with a concise model based on the dynamic reduction of uncertainty about what image was presented. These results emphasize the importance of timing in visual processing and provide strong constraints for the development of dynamical models of visual shape recognition.

}, keywords = {Adult, Female, Form Perception, Humans, Male, Pattern Recognition, Visual, Psychophysics, Time Factors, Vision, Ocular, Visual Pathways, Young Adult}, issn = {1534-7362}, doi = {10.1167/14.5.7}, author = {Jedediah Singer and Gabriel Kreiman} } @article {1133, title = {Short temporal asynchrony disrupts visual object recognition}, journal = {Journal of Vision}, volume = {12}, year = {2014}, author = {Jedediah Singer and Gabriel Kreiman} } @article {2266, title = {Short temporal asynchrony disrupts visual object recognition}, year = {2014}, publisher = {Journal of Vision}, abstract = {

Intracranial field potential recordings, images and code from Liu et al. (2009). The data show rapid responses along the ventral visual stream in the human brain, with selectivity to faces and objects and tolerance to object transformations that can be decoded in single trials.

Used in conjunction with this mirrored CBMM Code entry

}, url = {http://klab.tch.harvard.edu/resources/singer_asynchrony.html}, author = {Jedediah Singer and Gabriel Kreiman} } @article {2260, title = {Short temporal asynchrony disrupts visual object recognition}, year = {2014}, publisher = {Journal of Vision}, abstract = {

Intracranial field potential recordings, images and code from Liu et al. (2009). The data show rapid responses along the ventral visual stream in the human brain, with selectivity to faces and objects and tolerance to object transformations that can be decoded in single trials.

Used in conjunction with this mirrored CBMM Dataset entry

}, url = {http://klab.tch.harvard.edu/resources/singer_asynchrony.html}, author = {Jedediah Singer and Gabriel Kreiman} } @article {224, title = {Simultaneous whole-animal 3D imaging of neuronal activity using light-field microscopy}, journal = {Nature Methods}, volume = {11}, year = {2014}, month = {05/18/2014}, pages = {727 - 730}, abstract = {

High-speed, large-scale three-dimensional (3D) imaging of neuronal activity poses a major challenge in neuroscience. Here we demonstrate simultaneous functional imaging of neuronal activity at single-neuron resolution in an entire Caenorhabditis elegans and in larval zebrafish brain. Our technique captures the dynamics of spiking neurons in volumes of ~700 μm {\texttimes} 700 μm {\texttimes} 200 μm at 20 Hz. Its simplicity makes it an attractive tool for high-speed volumetric calcium imaging.

}, keywords = {Imaging, Neuroscience}, issn = {1548-7091}, doi = {10.1038/nmeth.2964}, url = {http://www.nature.com/doifinder/10.1038/nmeth.2964}, author = {Prevedel, Robert and Yoon, Young-Gyu and Hoffmann, Maximilian and Pak, Nikita and Wetzstein, Gordon and Kato, Saul and Schr{\"o}del, Tina and Raskar, Ramesh and Zimmer, Manuel and Edward S Boyden and Vaziri, Alipasha} } @book {1118, title = {Single Neuron Studies of the Human Brain. Probing Cognition}, series = {Probing cognition}, year = {2014}, publisher = {MIT Press}, organization = {MIT Press}, address = {Cambridge, MA}, author = {Itzhak Fried and Ueli Rutishauser and Moran Cerf and Gabriel Kreiman} } @article {217, title = {Spatiotemporal Dynamics Underlying Object Completion in Human Ventral Visual Cortex}, journal = {Neuron}, volume = {83}, year = {2014}, month = {08/06/2014}, pages = {736 - 748}, abstract = {

Natural vision often involves recognizing objects from partial information. Recognition of objects from parts presents a significant challenge for theories of vision because it requires spatial integration and extrapolation from prior knowledge. Here we recorded intracranial field potentials of 113 visually selective electrodes from epilepsy patients in response to whole and partial objects. Responses along the ventral visual stream, particularly the Inferior Occipital and Fusiform Gyri, remained selective despite showing only 9-25\% of the object areas. However, these visually selective signals emerged ~100 ms later for partial versus whole objects. These processing delays were particularly pronounced in higher visual areas within the ventral stream. This latency difference persisted when controlling for changes in contrast, signal amplitude, and the strength of selectivity. These results argue against a purely feed-forward explanation of recognition from partial information, and provide spatiotemporal constraints on theories of object recognition that involve recurrent processing.

}, keywords = {Circuits for Intelligence, vision}, issn = {08966273}, doi = {10.1016/j.neuron.2014.06.017}, url = {http://linkinghub.elsevier.com/retrieve/pii/S089662731400539X}, author = {Hanlin Tang and Buia, Calin and Radhika Madhavan and NE Crone and Joseph Madsen and WS Anderson and Gabriel Kreiman} } @article {1140, title = {Speech Representations based on a Theory for Learning Invariances}, year = {2014}, month = {10/2014}, type = {poster presentation}, address = {SANE 2014 - Speech and Audio in the Northeast}, abstract = {

Recognition of sounds and speech from a small number of labelled examples (as humans do) depends on the properties of the representation of the acoustic input. We formulate the problem of extracting robust speech features for supervised learning with small sample complexity as a problem of learning representations of the signal that are maximally invariant to intraclass transformations and deformations. We propose an extension of a theory for unsupervised learning of invariant visual representations to the auditory domain that requires the memory-based, unsupervised storage of acoustic templates -- such as specific phones or words -- together with all the transformations of each that normally occur. A quasi-invariant representation for a speech signal can be obtained by projecting it onto a number of template orbits, i.e., each one a set of transformed template signals, and computing the associated one-dimensional empirical probability distributions. The computations are performed by modules of filtering and pooling that can be used for obtaining a mapping in single- or multilayer architectures. We consider several aspects of such representations, including different signal scales (word vs. frame), input domains (raw waveforms vs. frequency filterbank responses), structures (shallow vs. multilayer/hierarchical), and ways of sampling from template orbit sets given a set of observations (explicit vs. learned). Preliminary empirical evaluations for learning to separate speech phones and words are given on TIMIT and subsets of TI-DIGITS.
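The sketch below illustrates the projection-and-histogram computation in a deliberately reduced setting: raw waveforms, circular time shifts as the only transformation, and a handful of random templates. These choices are assumptions made for illustration, not the configuration used in the paper.

```python
# Illustrative template-orbit signature: project a waveform onto shifted copies
# of each template and histogram the normalized dot products.
import numpy as np

def orbit(template, shifts):
    """Transformed versions of a template (here: circular time shifts only)."""
    return np.stack([np.roll(template, s) for s in shifts])

def signature(signal, templates, shifts, bins=20):
    """Concatenated empirical distributions of <signal, transformed template>."""
    parts = []
    for t in templates:
        dots = orbit(t, shifts) @ signal
        dots /= np.linalg.norm(signal) * np.linalg.norm(t) + 1e-12
        hist, _ = np.histogram(dots, bins=bins, range=(-1, 1), density=True)
        parts.append(hist)
    return np.concatenate(parts)

# Because the sampled shifts form a grid, shifting the word by a grid step only
# permutes the dot products, so the signature is unchanged (distance 0).
rng = np.random.default_rng(0)
word = rng.standard_normal(1600)
templates = [rng.standard_normal(1600) for _ in range(5)]
shifts = range(0, 1600, 100)
print(np.linalg.norm(signature(word, templates, shifts) -
                     signature(np.roll(word, 200), templates, shifts)))
```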

}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @conference {1320, title = {The strategic use of noise in pragmatic reasoning}, year = {2014}, author = {Leon Bergen and Noah D. Goodman} } @article {219, title = {Subtasks of Unconstrained Face Recognition}, year = {2014}, month = {01/2014}, publisher = {9th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications. (VISAPP).}, address = {Lisbon, Portugal}, abstract = {

Unconstrained face recognition remains a challenging computer vision problem despite recent exceptionally high results (\~{} 95\% accuracy) on the current gold standard evaluation dataset: Labeled Faces in the Wild (LFW) (Huang et al., 2008; Chen et al., 2013). We offer a decomposition of the unconstrained problem into subtasks based on the idea that invariance to identity-preserving transformations is the crux of recognition. Each of the subtasks in the Subtasks of Unconstrained Face Recognition (SUFR) challenge consists of a same-different face-matching problem on a set of 400 individual synthetic faces rendered so as to isolate a specific transformation or set of transformations. We characterized the performance of 9 different models (8 previously published) on each of the subtasks. One notable finding was that the HMAX-C2 feature was not nearly as clutter-resistant as had been suggested by previous publications (Leibo et al., 2010; Pinto et al., 2011). Next we considered LFW and argued that it is too easy a task to continue to be regarded as a measure of progress on unconstrained face recognition. In particular, strong performance on LFW requires almost no invariance, yet it cannot be considered a fair approximation of the outcome of a detection{\textrightarrow}alignment pipeline since it does not contain the kinds of variability that realistic alignment systems produce when working on non-frontal faces. We offer a new, more difficult, natural-image dataset: SUFR-in-the-Wild (SUFR-W), which we created using a protocol similar to LFW{\textquoteright}s, but with a few differences designed to produce more need for transformation invariance. We present baseline results for eight different face recognition systems on the new dataset and argue that it is time to retire LFW and move on to more difficult evaluations for unconstrained face recognition.


}, keywords = {Face identification, Invariance, Labeled Faces in the Wild, Same-different matching, Synthetic data}, author = {JZ. Leibo and Qianli Liao and Tomaso Poggio} } @article {384, title = {Subtasks of unconstrained face recognition}, year = {2014}, month = {01/2014}, abstract = {

This package contains:

1. \ SUFR-W, a dataset of {\textquotedblleft}in the wild{\textquotedblright} natural images of faces gathered from the internet. The protocol used to create the dataset is described in Leibo, Liao and Poggio (2014).

2. \ The full set of SUFR synthetic datasets, called the {\textquotedblleft}Subtasks of Unconstrained Face Recognition Challenge{\textquotedblright} in Leibo, Liao and Poggio (2014).


}, keywords = {Computer vision}, author = {JZ. Leibo and Qianli Liao and Tomaso Poggio} } @inbook {4419, title = {Tomaso A. Poggio}, booktitle = {The History of Neuroscience in Autobiography Volume 8}, volume = {8}, year = {2014}, month = {04/2014}, publisher = {Society for Neuroscience}, organization = {Society for Neuroscience}, abstract = {

Tomaso Poggio began his career in collaboration with Werner Reichardt, quantitatively characterizing the visuomotor control system in the fly. With David Marr, he introduced the seminal idea of levels of analysis in computational neuroscience. He introduced regularization as a mathematical framework to approach the ill-posed problems of vision and{\textemdash}more importantly{\textemdash}the key problem of learning from data. He has contributed to the early development of the theory of learning{\textemdash}in particular introducing the mathematics of radial basis functions (RBF){\textemdash}and to supervised learning in reproducing kernel Hilbert spaces (RKHSs) and stability. In the last decade, he has developed an influential quantitative model of visual recognition in the visual cortex, recently extended in a theory of sensory perception. He is one of the most cited computational scientists, with contributions ranging from biophysical and behavioral studies of the visual system to computational analyses of vision and learning in humans and machines.

}, isbn = {978-0-615-94079-3}, url = {https://www.sfn.org/about/history-of-neuroscience/autobiographical-chapters}, author = {Tomaso Poggio and Larry R. Squire} } @article {455, title = {Unsupervised learning of clutter-resistant visual representations from natural videos.}, number = {023}, year = {2014}, month = {09/2014}, abstract = {

Populations of neurons in inferotemporal cortex (IT) maintain an explicit code for object identity that also tolerates transformations of object appearance, e.g., position, scale, and viewing angle [1, 2, 3]. Though the learning rules are not known, recent results [4, 5, 6] suggest the operation of an unsupervised temporal-association-based method, e.g., Foldiak{\textquoteright}s trace rule [7]. Such methods exploit the temporal continuity of the visual world by assuming that visual experience over short timescales will tend to have invariant identity content. Thus, by associating representations of frames from nearby times, a representation that tolerates whatever transformations occurred in the video may be achieved. Many previous studies verified that such rules can work in simple situations without background clutter, but the presence of visual clutter has remained problematic for this approach. Here we show that temporal association based on large class-specific filters (templates) avoids the problem of clutter. Our system learns in an unsupervised way from natural videos gathered from the internet, and is able to perform a difficult unconstrained face recognition task on natural images (Labeled Faces in the Wild [8]).
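The cited trace rule lends itself to a compact sketch. The version below is a generic Hebbian update gated by a low-pass temporal trace, with illustrative parameters; it conveys the temporal-association idea but does not reproduce the paper's template-based system.

```python
# Generic trace-rule sketch: associate temporally adjacent frames so that unit
# responses track identity content that persists across transformations.
import numpy as np

def trace_rule(frames, n_units=10, eta=0.05, delta=0.2, seed=0):
    """frames: (time, pixels) array of consecutive video frames."""
    rng = np.random.default_rng(seed)
    W = 0.01 * rng.standard_normal((n_units, frames.shape[1]))
    trace = np.zeros(n_units)
    for x in frames:
        y = W @ x                                   # responses to the current frame
        trace = (1 - delta) * trace + delta * y     # low-pass "memory" of recent activity
        W += eta * np.outer(trace, x)               # Hebbian update gated by the trace
        W /= np.linalg.norm(W, axis=1, keepdims=True)  # keep weight norms bounded
    return W

W = trace_rule(np.random.default_rng(1).standard_normal((200, 64)))
```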

}, author = {Qianli Liao and JZ. Leibo and Tomaso Poggio} } @article {226, title = {Unsupervised learning of invariant representations with low sample complexity: the magic of sensory cortex or a new framework for machine learning?}, number = {001}, year = {2014}, month = {03/2014}, abstract = {

The present phase of Machine Learning is characterized by supervised learning algorithms relying on large sets of labeled examples (n{\textrightarrow}$\infty$). The next phase is likely to focus on algorithms capable of learning from very few labeled examples (n{\textrightarrow}1), like humans seem able to do. We propose an approach to this problem and describe the underlying theory, based on the unsupervised, automatic learning of a {\textquotedblleft}good{\textquotedblright} representation for supervised learning, characterized by small sample complexity (n). We consider the case of visual object recognition, though the theory applies to other domains. The starting point is the conjecture, proved in specific cases, that image representations which are invariant to translations, scaling and other transformations can considerably reduce the sample complexity of learning. We prove that an invariant and unique (discriminative) signature can be computed for each image patch, I, in terms of empirical distributions of the dot-products between I and a set of templates stored during unsupervised learning. A module performing filtering and pooling, like the simple and complex cells described by Hubel and Wiesel, can compute such estimates. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts. The theory extends existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects/images which is invariant to transformations, stable, and discriminative for recognition{\textemdash}and that this representation may be continuously learned in an unsupervised way during development and visual experience.

}, keywords = {Computer vision, Pattern recognition}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @inbook {1130, title = {Visual cognitive adventures of single neurons in the human medial temporal lobe}, booktitle = {Single neuron studies of the human brain. Probing cognition}, year = {2014}, chapter = {8}, author = {Mormann, F and Matias J. Ison and Quiroga, RQ and Christof Koch and Itzhak Fried and Gabriel Kreiman} } @article {459, title = {When Computer Vision Gazes at Cognition.}, number = {025}, year = {2014}, month = {12/2014}, abstract = {

Joint attention is a core, early-developing form of social interaction. It is based on our ability to discriminate the third-party objects that other people are looking at. While it has been shown that people can accurately determine whether another person is looking directly at them versus away, little is known about the human ability to discriminate a third person's gaze directed toward objects that are further away, especially in unconstrained cases where the looker can move her head and eyes freely. In this paper we address this question by jointly exploring human psychophysics and a cognitively motivated computer vision model, which can detect the 3D direction of gaze from 2D face images. The synthesis of behavioral study and computer vision yields several interesting discoveries. (1) Human accuracy at discriminating targets 8{\deg}-10{\deg} of visual angle apart is around 40\% in a free-looking gaze task; (2) the ability to interpret the gaze of different lookers varies dramatically; (3) this variance can be captured by the computational model; (4) humans outperform the current model significantly. These results collectively show that the acuity of human joint attention is indeed highly impressive, given the computational challenge of the natural looking task. Moreover, the gap between human and model performance, as well as the variability of gaze interpretation across different lookers, requires further understanding of the underlying mechanisms humans use for this challenging task.

}, author = {Tao Gao and Daniel Harari and Joshua B. Tenenbaum and Shimon Ullman} } @conference {221, title = {Word-level Invariant Representations From Acoustic Waveforms}, booktitle = {INTERSPEECH 2014 - 15th Annual Conf. of the International Speech Communication Association}, year = {2014}, publisher = {International Speech Communication Association (ISCA)}, organization = {International Speech Communication Association (ISCA)}, address = {Singapore}, abstract = {

Extracting discriminant, transformation-invariant features from raw audio signals remains a serious challenge for speech recognition. The issue of speaker variability is central to this problem, as changes in accent, dialect, gender, and age alter the sound waveform of speech units at multiple scales (phonemes, words, or phrases). Approaches for dealing with this variability have typically focused on analyzing the spectral properties of speech at the level of frames, on par with frame-level acoustic modeling usually applied to speech recognition systems. In this paper, we propose a framework for representing speech at the whole-word level and extracting features from the acoustic, temporal domain, without the need for spectral encoding or pre-processing. Leveraging recent work on unsupervised learning of invariant sensory representations, we extract a signature for a word by first projecting its raw waveform onto a set of templates and their transformations, and then forming empirical estimates of the resulting one-dimensional distributions via histograms. The representation and relevant parameters are evaluated for word classification on a series of datasets with increasing speaker-mismatch difficulty, and the results are compared to those of an MFCC-based representation.

}, keywords = {Invariance, Speech Representation, Theories for Intelligence}, url = {http://www.isca-speech.org/archive/interspeech_2014/i14_2385.html}, author = {Stephen Voinea and Chiyuan Zhang and Georgios Evangelopoulos and Lorenzo Rosasco and Tomaso Poggio} } @inbook {1135, title = {Computational models of visual object recognition}, booktitle = {Principles of neural coding}, year = {2013}, author = {Gabriel Kreiman} } @article {1349, title = {Core foundations of abstract geometry}, journal = {Proceedings of National Academy of Sciences of the United States of America}, volume = {110}, year = {2013}, chapter = {14191}, abstract = {

Human adults from diverse cultures share intuitions about the points, lines, and figures of Euclidean geometry. Do children develop these intuitions by drawing on phylogenetically ancient and developmentally precocious geometric representations that guide their navigation and their analysis of object shape? In what way might these early-arising representations support later-developing Euclidean intuitions? To approach these questions, we investigated the relations among young children{\textquoteright}s use of geometry in tasks assessing: navigation; visual form analysis; and the interpretation of symbolic, purely geometric maps. Children{\textquoteright}s navigation depended on the distance and directional relations of the surface layout and predicted their use of a symbolic map with targets designated by surface distances. In contrast, children{\textquoteright}s analysis of visual forms depended on the size-invariant shape relations of objects and predicted their use of the same map but with targets designated by corner angles. Even though the two map tasks used identical instructions and map displays, children{\textquoteright}s performance on these tasks showed no evidence of integrated representations of distance and angle. Instead, young children flexibly recruited geometric representations of either navigable layouts or objects to interpret the same spatial symbols. These findings reveal a link between the early-arising geometric representations that humans share with diverse animals and the flexible geometric intuitions that give rise to human knowledge at its highest reaches. Although young children do not appear to integrate core geometric representations, children{\textquoteright}s use of the abstract geometry in spatial symbols such as maps may provide the earliest clues to the later construction of Euclidean geometry.

}, author = {Moira R Dillon and Yi Huang and Elizabeth S Spelke} } @inbook {218, title = {On Learnability, Complexity and Stability}, booktitle = {Empirical Inference}, year = {2013}, pages = {59 - 69}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, chapter = {7}, address = {Berlin, Heidelberg}, abstract = {

Empirical Inference, Chapter 7

Editors: Bernhard Sch{\"o}lkopf, Zhiyuan Luo and Vladimir Vovk

Abstract:

We consider the fundamental question of learnability of a hypothesis class in the supervised learning setting and in the general learning setting introduced by Vladimir Vapnik. We survey classic results characterizing learnability in terms of suitable notions of complexity, as well as more recent results that establish the connection between learnability and stability of a learning algorithm.

}, isbn = {978-3-642-41135-9}, doi = {10.1007/978-3-642-41136-6_7}, url = {http://link.springer.com/10.1007/978-3-642-41136-6}, author = {Villa, Silvia and Lorenzo Rosasco and Tomaso Poggio and Sch{\"o}lkopf, Bernhard and Luo, Zhiyuan and Vovk, Vladimir} } @article {391, title = {The Neural Decoding Toolbox}, year = {2013}, month = {01/2013}, abstract = {

The Neural Decoding Toolbox (NDT) is a MATLAB package that makes it easy to apply population decoding analyses to neural activity. The toolbox is designed in a modular fashion, making it easy to try different analyses while keeping a core processing stream intact. Using the toolbox one can analyze data from many different types of recording modalities, including spiking data and EEG/MEG signals. The toolbox also allows for more complex analyses, such as testing whether information is contained in a dynamic population code and assessing whether information is represented in an abstract format.

}, url = {http://www.readout.info/}, author = {Ethan Meyers} } @article {2881, title = {Neural Representation Benchmark [code]}, year = {2013}, abstract = {

A key requirement for the development of effective learning representations is their evaluation and comparison to representations we know to be effective. In natural sensory domains, the community has viewed the brain as a source of inspiration and as an implicit benchmark for success. However, it has not been possible to test representational learning algorithms directly against the representations contained in neural systems. Here, we propose a new benchmark for visual representations on which we have directly tested the neural representation in multiple visual cortical areas in macaque (utilizing data from [Majaj et al., 2012]), and on which any computer vision algorithm that produces a feature space can be tested. The benchmark measures the effectiveness of the neural or machine representation by computing the classification loss on the ordered eigendecomposition of a kernel matrix [Montavon et al., 2011]. In our analysis we find that the neural representation in visual area IT is superior to visual area V4. In our analysis of representational learning algorithms, we find that three-layer models approach the representational performance of V4 and the algorithm in [Le et al., 2012] surpasses the performance of V4. Impressively, we find that a recent supervised algorithm [Krizhevsky et al., 2012] achieves performance comparable to that of IT for an intermediate level of image variation difficulty, and surpasses IT at a higher difficulty level. We believe this result represents a major milestone: it is the first learning algorithm we have found that exceeds our current estimate of IT representation performance. We hope that this benchmark will assist the community in matching the representational performance of visual cortex and will serve as an initial rallying point for further correspondence between representations derived in brains and machines.
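As a rough illustration of the kind of analysis the benchmark performs, the sketch below computes a centered linear kernel of a feature set, orders its eigendirections, and reports the accuracy of a simple readout as a function of the number of leading directions retained. The data, readout, and parameters are placeholders; this is not the benchmark's code.

```python
# Illustrative kernel-eigendecomposition analysis of a feature representation.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def kernel_eig_curve(features, labels, dims=(1, 2, 4, 8, 16, 32)):
    """Readout accuracy on the leading eigendirections of a centered kernel."""
    K = features @ features.T                      # linear kernel matrix
    K = K - K.mean(axis=0)                         # double-center the kernel
    K = K - K.mean(axis=1, keepdims=True)
    vals, vecs = np.linalg.eigh(K)
    order = np.argsort(vals)[::-1]                 # eigendirections by decreasing eigenvalue
    curve = {}
    for d in dims:
        Z = vecs[:, order[:d]] * np.sqrt(np.clip(vals[order[:d]], 0, None))
        acc = cross_val_score(LogisticRegression(max_iter=1000), Z, labels, cv=5).mean()
        curve[d] = acc
    return curve

rng = np.random.default_rng(0)
print(kernel_eig_curve(rng.standard_normal((300, 128)), rng.integers(0, 5, 300)))
```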

For more information and to download code, etc. please visit the project website - http://dicarlolab.mit.edu/neuralbenchmark

}, author = {Charles F. Cadieu and Ha Hong and Daniel L K Yamins and Nicolas Pinto and Najib J. Majaj and James J. DiCarlo} } @article {223, title = {NSF Science and Technology Centers {\textendash} The Class of 2013}, year = {2013}, month = {11/2013}, publisher = {North America Gender Summit}, address = {Washington, D.C.}, author = {Eaton Lattman and Tomaso Poggio and Robert Westervelt} } @article {385, title = {Object recognition data sets (iCub/IIT)}, year = {2013}, month = {05/2013}, abstract = {

Data set for object recognition and categorization. 10 categories, 40 objects for the training phase. The acquisition size is 640{\texttimes}480 and subsequently cropped to the bounding box of the object according to the kinematics or motion cue. The bounding box is 160{\texttimes}160 in human mode and 320{\texttimes}320 in robot mode. For each object we provide 200 training samples. Each category is trained with 3 objects (600 examples per category).


Publications

Fanello, S.R.; Ciliberto, C.; Santoro, M.; Natale, L.; Metta, G.; Rosasco, L.; Odone, F., {\textquotedblleft}iCub World: Friendly Robots Help Building Good Vision Data-Sets,{\textquotedblright} In Proceedings of IEEE Conference on Computer Vision and Pattern Recognition Workshops (CVPR), 2013

Fanello, S. R.; Ciliberto, C.; Natale, L.; Metta, G., {\textquotedblleft}Weakly Supervised Strategies for Natural Object Recognition in Robotics,{\textquotedblright} IEEE International Conference on Robotics and Automation (ICRA). Karlsruhe, Germany, May 6-10, 2013

Fanello, S.R.; Noceti, N.; Metta, G.; Odone, F., {\textquotedblleft}Multi-Class Image Classification: Sparsity Does It Better,{\textquotedblright} International Conference on Computer Vision Theory and Applications (VISAPP), 2013

Ciliberto C.; Smeraldi F.; Natale L.; Metta G., {\textquotedblleft}Online Multiple Instance Learning Applied to Hand Detection in a Humanoid Robot,{\textquotedblright} IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS2011). San Francisco, California, USA, September 25-30, 2011

}, keywords = {Computer vision, object recognition, robotics}, author = {Lorenzo Rosasco} } @proceedings {387, title = {Unsupervised Learning of Invariant Representations in Hierarchical Architectures.}, year = {2013}, month = {11/2013}, abstract = {

Representations that are invariant to translation, scale and other transformations can considerably reduce the sample complexity of learning, allowing recognition of new object classes from very few examples {\textendash} a hallmark of human recognition. Empirical estimates of one-dimensional projections of the distribution induced by a group of affine transformations are proven to represent a unique and invariant signature associated with an image. We show how projections yielding invariant signatures for future images can be learned automatically, and updated continuously, during unsupervised visual experience. A module performing filtering and pooling, like simple and complex cells as proposed by Hubel and Wiesel, can compute such estimates. Under this view, a pooling stage estimates a one-dimensional probability distribution. Invariance from observations through a restricted window is equivalent to a sparsity property w.r.t. a transformation, which yields templates that are a) Gabor for optimal simultaneous invariance to translation and scale or b) very specific for complex, class-dependent transformations such as rotation in depth of faces. Hierarchical architectures consisting of this basic Hubel-Wiesel module inherit its properties of invariance, stability, and discriminability while capturing the compositional organization of the visual world in terms of wholes and parts, and are invariant to complex transformations that may only be locally affine. The theory applies to several existing deep learning convolutional architectures for image and speech recognition. It also suggests that the main computational goal of the ventral stream of visual cortex is to provide a hierarchical representation of new objects which is invariant to transformations, stable, and discriminative for recognition {\textendash} this representation may be learned in an unsupervised way from natural visual experience.


}, keywords = {convolutional networks, Hierarchy, Invariance, visual cortex}, author = {F. Anselmi and JZ. Leibo and Lorenzo Rosasco and Jim Mutch and Andrea Tacchetti and Tomaso Poggio} } @article {437, title = {VIP: A unifying framework for eye-gaze research}, year = {2013}, abstract = {

We have collected the first fixation dataset which captures all 3 VIP factors.

The images were selected from the NUSEF dataset, which contains both neutral and affective images. Out of 758 NUSEF images, 150 were randomly selected. 75 subjects were recruited from a mixture of undergraduate, postgraduate, and working adult populations. Male and female subjects were recruited separately to ensure an even distribution. They were tasked to view the 150 images in a free-viewing (i.e., without an assigned task) or anomaly-detection setting. Each image was displayed for 5 seconds, followed by 2 seconds of viewing a gray screen. The images were displayed in random order. Eye-gaze data were recorded at 120 Hz with a binocular infrared-based remote eye-tracking device (SMI RED 250). The subjects were seated at a distance of 50 centimeters from a 22-inch LCD monitor with 1680x1050 resolution. This setup is similar to others used in eye-gaze research.

Before the start of the viewing experiment, the subjects also provided their demographic data: gender, age group, ethnicity, religion, field of study/work, highest education qualification, income group, expenditure group, and nationality. Three personality-type questions were posed based on Jung{\textquoteright}s psychological types. The recorded eye-gaze data were preprocessed with the SMI SDK to extract the fixations from the preferred eye as chosen by the subjects.

Download and copyright

The VIP dataset can be downloaded as a single zip file. The VIP dataset is available for research purposes only. By downloading or using the dataset, you are deemed to agree to its terms and conditions.

If you are using this dataset, please cite:

A Unifying Framework for Computational Eye-Gaze Research.
Keng-Teck Ma, Terence Sim, and Mohan Kankanhalli.
4th International Workshop on Human Behavior Understanding. Barcelona, Spain. 2013.

}, url = {http://mmas.comp.nus.edu.sg/VIP.html}, author = {Keng-Teck Ma and Terence Sim and Mohan Kankanhalli} } @article {2813, title = {Avalanche analysis from multielectrode ensemble recordings in cat, monkey, and human cerebral cortex during wakefulness and sleep}, journal = {Frontiers in Physiology}, year = {2012}, abstract = {

Self-organized critical states are found in many natural systems, from earthquakes to forest fires; they have also been observed in neural systems, particularly in neuronal cultures. However, the presence of critical states in the awake brain remains controversial. Here, we compared avalanche analyses performed on different\ in vivo\ preparations during wakefulness, slow-wave sleep, and REM sleep, using high-density electrode arrays in cat motor cortex (96 electrodes), monkey motor and premotor cortex, and human temporal cortex (96 electrodes) in epileptic patients. In neuronal avalanches defined from units (up to 160 single units), the size of avalanches never clearly scaled as a power law, but rather scaled exponentially or displayed intermediate scaling. We also analyzed the dynamics of local field potentials (LFPs), and in particular LFP negative peaks (nLFPs), among the different electrodes (up to 96 sites in temporal cortex or up to 128 sites in adjacent motor and premotor cortices). In this case, the avalanches defined from nLFPs displayed power-law scaling in double-logarithmic representations, as reported previously in monkey. However, avalanches defined from positive LFP (pLFP) peaks, which are less directly related to neuronal firing, also displayed apparent power-law scaling. Closer examination of this scaling using the more reliable cumulative distribution function (CDF) and other rigorous statistical measures did not confirm power-law scaling. The same pattern was seen for cats, monkeys, and humans, as well as for different brain states of wakefulness and sleep. We also tested other alternative distributions; multiple exponential fitting yielded optimal fits of the avalanche dynamics with bi-exponential distributions. Collectively, these results show no clear evidence for power-law scaling or self-organized critical states in the awake and sleeping brain of mammals, from cat to man.
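To make the distribution-fitting step concrete, the sketch below compares maximum-likelihood power-law and exponential fits to a list of avalanche sizes; the synthetic data and the simple likelihood comparison are stand-ins for the paper's CDF-based and other statistical tests.

```python
# Compare power-law vs. exponential fits to avalanche sizes above a cutoff.
import numpy as np

def compare_tails(sizes, s_min=1.0):
    s = np.asarray([x for x in sizes if x >= s_min], dtype=float)
    n = len(s)
    alpha = 1.0 + n / np.sum(np.log(s / s_min))      # MLE power-law exponent
    lam = 1.0 / np.mean(s - s_min)                   # MLE exponential rate
    ll_pl = n * np.log((alpha - 1) / s_min) - alpha * np.sum(np.log(s / s_min))
    ll_exp = n * np.log(lam) - lam * np.sum(s - s_min)
    return {"alpha": alpha, "lambda": lam,
            "loglik_powerlaw": ll_pl, "loglik_exponential": ll_exp}

# Exponentially distributed "avalanche sizes" should favor the exponential model.
rng = np.random.default_rng(0)
print(compare_tails(1.0 + rng.exponential(scale=3.0, size=5000)))
```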

}, doi = {10.3389/fphys.2012.00302}, url = {http://journal.frontiersin.org/article/10.3389/fphys.2012.00302}, author = {Nima Dehghani and Nicholas Hatsopoulos and Zach Haga and Rebecca Parker and Bradley Greger and Eric Halgren and Sydney Cash and Alain Destexhe} } @article {2264, title = {cnpkg: 3-D Convolutional Network Package for CNS}, year = {2012}, month = {04/2012}, abstract = {

A CNS package for creating 3-D convolutional networks and training them via the backpropagation algorithm.

(assumes CNS is already installed)

}, author = {Jim Mutch and Srini Turaga} } @article {2265, title = {hhpkg: Hodgkin-Huxley Package for CNS}, year = {2012}, month = {04/2012}, abstract = {

A CNS package that defines several types of spiking cells using Hodgkin-Huxley dynamics.

(assumes CNS is already installed)

}, author = {Ulf Knoblich} } @article {2263, title = {HMAX Package for CNS}, year = {2012}, month = {04/2012}, abstract = {

A CNS package that can be used to instantiate a broad class of feedforward object recognition models.

Note: this package is a reorganized and renamed version of the Feature Hierarchy package (fhpkg). The last version of the FH package can be downloaded here. The download also contains a compatible version of CNS.

}, author = {Jim Mutch} } @conference {4886, title = {Learning manifolds with k-means and k-flats}, booktitle = {Advances in Neural Information Processing Systems 25 (NIPS 2012)}, year = {2012}, month = {12/2012}, abstract = {

We study the problem of estimating a manifold from random samples. In particular, we consider piecewise constant and piecewise linear estimators induced by k-means and k-flats, and analyze their performance. We extend previous results for k-means in two separate directions. First, we provide new results for k-means reconstruction on manifolds and, secondly, we prove reconstruction bounds for higher-order approximation (k-flats), for which no known results were previously available. While the results for k-means are novel, some of the technical tools are well-established in the literature. In the case of k-flats, both the results and the mathematical tools are new.
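The sketch below reproduces the flavor of the k-means side of this setting on a toy manifold: sample points from a circle in the plane, quantize with k-means, and track how the reconstruction (quantization) error falls as k grows. The toy data and parameters are illustrative only.

```python
# Toy k-means manifold reconstruction: quantization error on samples from a circle.
import numpy as np
from sklearn.cluster import KMeans

def circle_samples(n, noise=0.0, seed=0):
    rng = np.random.default_rng(seed)
    theta = rng.uniform(0.0, 2.0 * np.pi, n)
    pts = np.c_[np.cos(theta), np.sin(theta)]
    return pts + noise * rng.standard_normal(pts.shape)

X = circle_samples(2000)
for k in (4, 16, 64):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    err = km.inertia_ / len(X)   # mean squared distance to the nearest center
    print(f"k={k:3d}  reconstruction error={err:.5f}")
```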

}, url = {https://papers.nips.cc/paper/2012/hash/b20bb95ab626d93fd976af958fbc61ba-Abstract.html}, author = {Guillermo D. Canas and Tomaso Poggio and Lorenzo Rosasco} } @article {2812, title = {Spatiotemporal dynamics of neocortical excitation and inhibition during human sleep}, journal = {Proceedings of the National Academy of Sciences}, year = {2012}, abstract = {

Intracranial recording is an important diagnostic method routinely used in a number of neurological monitoring scenarios. In recent years, advancements in such recordings have been extended to include unit activity of an ensemble of neurons. However, a detailed functional characterization of excitatory and inhibitory cells has not been attempted in human neocortex, particularly during the sleep state. Here, we report that such feature discrimination is possible from high-density recordings in the neocortex by using 2D multielectrode arrays. Successful separation of regular-spiking neurons (or bursting cells) from fast-spiking cells resulted in well-defined clusters that each showed unique intrinsic firing properties. The high density of the array, which allowed recording from a large number of cells (up to 90), helped us to identify apparent monosynaptic connections, confirming the excitatory and inhibitory nature of regular-spiking and fast-spiking cells, thus categorized as putative pyramidal cells and interneurons, respectively. Finally, we investigated the dynamics of correlations within each class. A marked exponential decay with distance was observed in the case of excitatory but not for inhibitory cells. Although the amplitude of that decline depended on the timescale at which the correlations were computed, the spatial constant did not. Furthermore, this spatial constant is compatible with the typical size of human columnar organization. These findings provide a detailed characterization of neuronal activity, functional connectivity at the microcircuit level, and the interplay of excitation and inhibition in the human neocortex.

}, doi = {10.1073/pnas.1109895109}, url = {http://www.pnas.org/content/109/5/1731}, author = {Adrien Peyrache and Nima Dehghani and Emad Eskandar and Joseph Madsen and WS Anderson and Jacob Donoghue and Leigh Hochberg and Eric Halgren and Sydney Cash and Alain Destexhe} } @article {2637, title = {Combined effects of feature-based working memory and feature-based attention on the perception of visual motion direction}, journal = {Journal of Vision}, volume = {11}, year = {2011}, chapter = {11}, author = {Diego Mendoza-Halliday and Megan Schneiderman and Christian Kaul and Julio Martinez-Trujillo} } @article {4065, title = {Combined effects of feature-based working memory and feature-based attention on the perception of visual motion direction}, journal = {Journal of Vision}, volume = {11}, year = {2011}, abstract = {

We investigated whether human subjects{\textquoteright} ability to identify the direction of a brief pulse of coherent motion in a random-dot pattern (RDP) was influenced by: (a) maintaining in working memory the direction of motion of an RDP previously presented far from the pulse (feature-based working memory or FBWM, Experiment 1), (b) attending to the direction of an RDP co-occurring with but far from the pulse (feature-based attention or FBA, Experiment 2), and (c) both FBWM and FBA acting simultaneously (Experiment 3). In the first two experiments, pulse direction identification performance was higher when the remembered direction (FBWM) or the direction of the concurrently attended RDP (FBA) matched the pulse direction than when it was opposite. In Experiment 3, performance was highest when both the remembered and the attended directions matched the pulse direction (combined effects of FBWM and FBA), intermediate when only one of them matched the pulse direction, and lowest when neither matched the pulse direction. Our results demonstrate that both feature-based working memory and feature-based attention can individually modulate the perception of motion direction and that, when acting together, they produce an even larger modulation.

}, keywords = {feature-based attention, motion direction, motion perception, Psychophysics, working memory}, doi = {doi:10.1167/11.1.11}, url = {http://www.journalofvision.org/content/11/1/11}, author = {Diego Mendoza-Halliday and Schneiderman, M. and Kaul, C. and Julio Martinez-Trujillo} } @article {380, title = {A Large Video Database for Human Motion Recognition}, year = {2011}, month = {01/2011}, abstract = {

With nearly one billion online videos viewed every day, an emerging new frontier in computer vision research is recognition and search in video. While much effort has been devoted to the collection and annotation of large, scalable static image datasets containing thousands of image categories, human action datasets lag far behind.

Here we introduce HMDB, collected from various sources, mostly from movies, and a small proportion from public databases such as the Prelinger archive, YouTube and Google videos. The dataset contains 6849 clips divided into 51 action categories, each containing a minimum of 101 clips.

The action categories can be grouped into five types:

  1. General facial actions: smile, laugh, chew, talk.
  2. Facial actions with object manipulation: smoke, eat, drink.
  3. General body movements: cartwheel, clap hands, climb, climb stairs, dive, fall on the floor, backhand flip, handstand, jump, pull up, push up, run, sit down, sit up, somersault, stand up, turn, walk, wave.
  4. Body movements with object interaction: brush hair, catch, draw sword, dribble, golf, hit something, kick ball, pick, pour, push something, ride bike, ride horse, shoot ball, shoot bow, shoot gun, swing baseball bat, sword exercise, throw.
  5. Body movements for human interaction: fencing, hug, kick someone, kiss, punch, shake hands, sword fight.


}, author = {E. Garrote and H. Jhuang and H. Huehne and Tomaso Poggio and T. Serre} } @article {381, title = {CNS ({\textquotedblleft}Cortical Network Simulator{\textquotedblright}): a GPU-based framework for simulating cortically-organized networks}, year = {2010}, month = {01/2010}, abstract = {

A general GPU-based framework for the fast simulation of {\textquotedblleft}cortically-organized{\textquotedblright} networks, defined as networks consisting of n-dimensional layers of similar cells.

This is a fairly broad class, including more than just {\textquotedblleft}HMAX{\textquotedblright} models. We have developed specialized CNS\ packages\ for HMAX feature hierarchy models (hmax), convolutional networks (cnpkg), and networks of Hodgkin-Huxley spiking cells (hhpkg).

While CNS is designed for use with a GPU, it can run (much more slowly) without one. It does, however, require MATLAB.


}, author = {Jim Mutch and Ulf Knoblich and Tomaso Poggio} } @article {382, title = {hmin: A Minimal HMAX Implementation}, year = {2010}, month = {01/2010}, abstract = {

This is a simple reference implementation of HMAX, meant for illustration. It is a single-threaded, CPU-based, pure C++ implementation (but still\ called\ via MATLAB{\textquoteright}s {\textquotedblleft}mex{\textquotedblright} interface).

The package contains C++ classes for layers and filters, and a main program that assembles them to implement one specific model.


}, author = {Jim Mutch} } @article {383, title = {System for Mouse Behavior Recognition}, year = {2010}, month = {01/2010}, abstract = {

Neurobehavioural analysis of mouse phenotypes requires the monitoring of mouse behaviour over long periods of time. In this study, we describe a trainable computer vision system enabling the automated analysis of complex mouse behaviours. We provide software and an extensive manually annotated video database used for training and testing the system. Our system performs on par with human scoring, as measured from ground-truth manual annotations of thousands of clips of freely behaving mice. As a validation of the system, we characterized the home-cage behaviours of two standard inbred and two non-standard mouse strains. From these data, we were able to predict in a blind test the strain identity of individual animals with high accuracy. Our video-based software will complement existing sensor-based automated approaches and enable an adaptable, comprehensive, high-throughput, fine-grained, automated analysis of mouse behaviour.


}, author = {E. Garrote and H. Jhuang and V. Khilnani and Tomaso Poggio and T. Serre and X. Yu} } @article {2259, title = {Timing, timing, timing: Fast decoding of object information from intracranial field potentials in human visual cortex}, year = {2009}, publisher = {Neuron}, abstract = {

Rapid responses along the ventral visual stream in the human brain show selectivity to faces and objects and tolerance to object transformations that can be decoded in single trials.

}, url = {http://klab.tch.harvard.edu/resources/liuetal_timing3.html}, author = {Liu, H and Agam, Y and Joseph Madsen and Gabriel Kreiman} }