@article {5312, title = {An empirical assay of view-invariant object learning in humans and comparison with baseline image-computable models}, journal = {bioRxiv}, year = {2023}, month = {01/2023}, abstract = {

How humans learn new visual objects is a longstanding scientific problem. Previous work has led to a diverse collection of models for how it is accomplished, but a current limitation in the field is a lack of empirical benchmarks which can be used to evaluate and compare specific models against each other. Here, we use online psychophysics to measure human behavioral learning trajectories over a set of tasks involving novel 3D objects. Consistent with intuition, these results show that humans generally require very few images (≈ 6) to approach their asymptotic accuracy, find some object discriminations more easy to learn than others, and generalize quite well over a range of image transformations after even one view of each object. We then use those data to develop benchmarks that may be used to evaluate a learning model{\textquoteright}s similarity to humans. We make these data and benchmarks publicly available [GitHub], and, to our knowledge, they are currently the largest publicly-available collection of learning-related psychophysics data in humans. Additionally, to serve as baselines for those benchmarks, we implement and test a large number of baseline models (n=1,932), each based on a standard cognitive theory of learning: that humans re-represent images in a fixed, Euclidean space, then learn linear decision boundaries in that space to identify objects in future images. We find some of these baseline models make surprisingly accurate predictions. However, we also find reliable prediction gaps between all baseline models and humans, particularly in the few-shot learning setting.

}, url = {https://www.biorxiv.org/content/10.1101/2022.12.31.522402v1}, author = {Michael J. Lee and James J. DiCarlo} } @article {5283, title = {Adversarially trained neural representations may already be as robust as corresponding biological neural representations}, journal = {arXiv}, year = {2022}, month = {06/2022}, abstract = {

Visual systems of primates are the gold standard of robust perception. There is thus a general belief that mimicking the neural representations that underlie those systems will yield artificial visual systems that are adversarially robust. In this work, we develop a method for performing adversarial visual attacks directly on primate brain activity. We then leverage this method to demonstrate that the above-mentioned belief might not be well founded. Specifically, we report that the biological neurons that make up visual systems of primates exhibit susceptibility to adversarial perturbations that is comparable in magnitude to existing (robustly trained) artificial neural networks.

}, author = {Chong Guo and Michael J. Lee and Guillaume Leclerc and Joel Dapello and Yug Rao and Aleksander Madry and James J. DiCarlo} } @article {5284, title = {Aligning Model and Macaque Inferior Temporal Cortex Representations Improves Model-to-Human Behavioral Alignment and Adversarial Robustness}, journal = {bioRxiv}, year = {2022}, month = {07/2022}, abstract = {

While some state-of-the-art artificial neural network systems in computer vision are strikingly accurate models of the corresponding primate visual processing, there are still many discrepancies between these models and the behavior of primates on object recognition tasks. Many current models suffer from extreme sensitivity to adversarial attacks and often do not align well with the image-by-image behavioral error patterns observed in humans. Previous research has provided strong evidence that primate object recognition behavior can be very accurately predicted by neural population activity in the inferior temporal (IT) cortex, a brain area in the late stages of the visual processing hierarchy. Therefore, here we directly test whether making the late stage representations of models more similar to that of macaque IT produces new models that exhibit more robust, primate-like behavior. We conducted chronic, large-scale multi-electrode recordings across the IT cortex in six non-human primates (rhesus macaques). We then use these data to fine-tune (end-to-end) the model {\textquotedblleft}IT{\textquotedblright} representations such that they are more aligned with the biological IT representations, while preserving accuracy on object recognition tasks. We generate a cohort of models with a range of IT similarity scores validated on held-out animals across two image sets with distinct statistics. Across a battery of optimization conditions, we observed a strong correlation between the models{\textquoteright} IT-likeness and alignment with human behavior, as well as an increase in its adversarial robustness. We further assessed the limitations of this approach and find that the improvements in behavioral alignment and adversarial robustness generalize across different image statistics, but not to object categories outside of those covered in our IT training set. 
Taken together, our results demonstrate that building models that are more aligned with the primate brain leads to more robust and human-like behavior, and call for larger neural data-sets to further augment these gains.

}, author = {Joel Dapello and Kohitij Kar and Martin Schrimpf and Robert Geary and Michael Ferguson and David D. Cox and James J. DiCarlo} } @conference {5298, title = {Primate Inferotemporal Cortex Neurons Generalize Better to Novel Image Distributions Than Analogous Deep Neural Networks Units}, booktitle = {NeurIPS}, year = {2022}, month = {10/2022}, abstract = {

Humans are successfully able to recognize objects in a variety of image distributions. Today{\textquoteright}s artificial neural networks (ANNs), on the other hand, struggle to recognize objects in many image domains, especially those different from the training distribution. It is currently unclear which parts of the ANNs could be improved in order to close this generalization gap. In this work, we used recordings from primate high-level visual cortex (IT) to isolate whether ANNs lag behind primate generalization capabilities because of their encoder (transformations up to the penultimate layer), or their decoder (linear transformation into class labels). Specifically, we fit a linear decoder on images from one domain and evaluate transfer performance on twelve held-out domains, comparing fitting on primate IT representations vs. representations in ANN penultimate layers. To fairly compare, we scale the number of each ANN{\textquoteright}s units so that its in-domain performance matches that of the sampled IT population (i.e. 71 IT neural sites, 73\% binary-choice accuracy). We find that the sampled primate population achieves, on average, 68\% performance on the held-out-domains. Comparably sampled populations from ANN model units generalize less well, maintaining on average 60\%. This is independent of the number of sampled units: models{\textquoteright} out-of-domain accuracies consistently lag behind primate IT. These results suggest that making ANN model units more like primate IT will improve the generalization performance of ANNs.

}, url = {https://openreview.net/forum?id=iPF7mhoWkOl}, author = {Ayu Marliawaty I Gusti Bagus and Tiago Marques and Sachi Sanghavi and James J. DiCarlo and Martin Schrimpf} } @article {5066, title = { Chemogenetic suppression of macaque V4 neurons produces retinotopically specific deficits in downstream IT neural activity patterns and core object recognition behavior}, journal = {Journal of Vision}, volume = {21}, year = {2021}, month = {09/2021}, chapter = {2489}, abstract = {

Distributed activity patterns across multiple brain areas (e.g., V4, IT) enable primates to accurately identify visual objects. To strengthen our inferences about the causal role of underlying brain circuits, it is necessary to develop targeted neural perturbation strategies that enable discrimination amongst competing models. To probe the role of area V4 in core object recognition, we expressed inhibitory DREADDs in neurons within a 5x5 mm subregion of V4 cortex via multiple viral injections (AAV8-hSyn-hM4Di-mCherry; two macaques). To assay for successful neural suppression, we recorded from a multi-electrode array implanted over the transfected V4. We also recorded from multi-electrode arrays in the IT cortex (the primary feedforward target of V4), while simultaneously measuring the monkeys{\textquoteright} behavior during object discrimination tasks. We found that systemic (intramuscular) injection of the DREADDs activator (CNO) produced reversible reductions (~20\%) in image-evoked V4 responses compared to the control condition (saline injections). Monkeys showed significant behavioral performance deficits upon CNO injections (compared to saline), which were larger when the object position overlapped with the RF estimates of the transfected V4 neurons. This is consistent with the hypothesis that the suppressed V4 neurons are critical to this behavior. Furthermore, we observed commensurate deficits in the linearly-decoded estimates of object identity from the IT population activity (post-CNO). To model the perturbed brain circuitry, we used a primate brain-mapped artificial neural network (ANN) model (CORnet-S) that supports object recognition. We {\textquotedblleft}lesioned{\textquotedblright} the model{\textquoteright}s corresponding V4 subregion by modifying its weights such that the responses matched a subset of our experimental V4 measurements (post-CNO). 
Indeed, the lesioned model better predicted the measured (held-out) V4 and IT responses (post-CNO), compared to the model{\textquoteright}s non-lesioned version, validating our approach. In the future, our approach allows us to discriminate amongst competing mechanistic brain models, while the data provides constraints to guide more accurate alternatives.

}, doi = {10.1167/jov.21.9.2489}, url = {https://jov.arvojournals.org/article.aspx?articleid=2777218}, author = {Kohitij Kar and Martin Schrimpf and Kailyn Schmidt and James J. DiCarlo} } @conference {5068, title = {Combining Different V1 Brain Model Variants to Improve Robustness to Image Corruptions in CNNs}, booktitle = {NeurIPS 2021}, year = {2021}, month = {12/2021}, abstract = {

While some convolutional neural networks (CNNs) have surpassed human visual abilities in object classification, they often struggle to recognize objects in images corrupted with different types of common noise patterns, highlighting a major limitation of this family of models. Recently, it has been shown that simulating a primary visual cortex (V1) at the front of CNNs leads to small improvements in robustness to these image perturbations. In this study, we start with the observation that different variants of the V1 model show gains for specific corruption types. We then build a new model using an ensembling technique, which combines multiple individual models with different V1 front-end variants. The model ensemble leverages the strengths of each individual model, leading to significant improvements in robustness across all corruption categories and outperforming the base model by 38\% on average. Finally, we show that using distillation it is possible to partially compress the knowledge in the ensemble model into a single model with a V1 front-end. While the ensembling and distillation techniques used here are hardly biologically-plausible, the results presented here demonstrate that by combining the specific strengths of different neuronal circuits in V1 it is possible to improve the robustness of CNNs for a wide range of perturbations.

}, url = {https://nips.cc/Conferences/2021/ScheduleMultitrack?event=41268}, author = {Avinash Baidya and Joel Dapello and James J. DiCarlo and Tiago Marques} } @article {5070, title = {Computational models of category-selective brain regions enable high-throughput tests of selectivity}, journal = {Nature Communications}, volume = {12}, year = {2021}, month = {12/2021}, abstract = {

Cortical regions apparently selective to faces, places, and bodies have provided important evidence for domain-specific theories of human cognition, development, and evolution. But claims of category selectivity are not quantitatively precise and remain vulnerable to empirical refutation. Here we develop artificial neural network-based encoding models that accurately predict the response to novel images in the fusiform face area, parahippocampal place area, and extrastriate body area, outperforming descriptive models and experts. We use these models to subject claims of category selectivity to strong\ tests, by screening for and synthesizing images predicted to produce high responses. We find that these high-response-predicted images are all unambiguous members of the hypothesized preferred category for each region. These results provide accurate, image-computable encoding models of each category-selective region, strengthen evidence for domain specificity in the brain, and point the way for future research characterizing the functional organization of the brain with unprecedented computational precision.

}, doi = {10.1038/s41467-021-25409-6}, url = {https://www.nature.com/articles/s41467-021-25409-6}, author = {N. Apurva Ratan Murty and Pouya Bashivan and Abate, Alex and James J. DiCarlo and Nancy Kanwisher} } @article {5074, title = {Fast Recurrent Processing via Ventrolateral Prefrontal Cortex Is Needed by the Primate Ventral Stream for Robust Core Visual Object Recognition}, journal = {Neuron}, volume = {109}, year = {2021}, month = {01/2021}, pages = {164 - 176.e5}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core object recognition require additional time to develop for specific, {\textquotedblleft}late-solved{\textquotedblright} images. This suggests the necessity of recurrent processing in these computations. Which brain circuits are responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventrolateral prefrontal cortex (vlPFC) is a critical recurrent node in this system, here, we pharmacologically inactivated parts of vlPFC and simultaneously measured IT activity while monkeys performed object discrimination tasks. vlPFC inactivation deteriorated the quality of late-phase (\>150\ ms from image onset) IT population code and produced commensurate behavioral deficits for late-solved images. Finally, silencing vlPFC caused the monkeys{\textquoteright} IT activity and behavior to become more like those produced by feedforward-only ventral stream models. Together with prior work, these results implicate fast recurrent processing through vlPFC as critical to producing behaviorally sufficient object representations in IT.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.09.035}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627320307595}, author = {Kohitij Kar and James J. DiCarlo} } @article {5079, title = {Multi-scale hierarchical neural network models that bridge from single neurons in the primate primary visual cortex to object recognition behavior}, journal = {bioRxiv}, year = {2021}, month = {08/2021}, abstract = {

Primate visual object recognition relies on the representations in cortical areas at the top of the ventral stream that are computed by a complex, hierarchical network of neural populations. While recent work has created reasonably accurate image-computable hierarchical neural network models of those neural stages, those models do not yet bridge between the properties of individual neurons and the overall emergent behavior of the ventral stream. One reason we cannot yet do this is that individual artificial neurons in multi-stage models have not been shown to be functionally similar to individual biological neurons. Here, we took an important first step by building and evaluating hundreds of hierarchical neural network models in how well their artificial single neurons approximate macaque primary visual cortical (V1) neurons. We found that single neurons in certain models are surprisingly similar to their biological counterparts and that the distributions of single neuron properties, such as those related to orientation and spatial frequency tuning, approximately match those in macaque V1. Critically, we observed that hierarchical models with V1 stages that better match macaque V1 at the single neuron level are also more aligned with human object recognition behavior. Finally, we show that an optimized classical neuroscientific model of V1 is more functionally similar to primate V1 than all of the tested multi-stage models, suggesting room for further model improvements with tangible payoffs in closer alignment to human behavior. These results provide the first multi-stage, multi-scale models that allow our field to ask precisely how the specific properties of individual V1 neurons relate to recognition behavior.

}, author = {Tiago Marques and Martin Schrimpf and James J. DiCarlo} } @conference {4532, title = {Evidence that recurrent pathways between the prefrontal and inferior temporal cortex is critical during core object recognition }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, Colorado, USA}, author = {Kohitij Kar and James J. DiCarlo} } @article {4659, title = {Fast Recurrent Processing via Ventrolateral Prefrontal Cortex Is Needed by the Primate Ventral Stream for Robust Core Visual Object Recognition}, journal = {Neuron}, year = {2020}, month = {10/2020}, abstract = {

Distributed neural population spiking patterns in macaque inferior temporal (IT) cortex that support core object recognition require additional time to develop for specific, {\textquoteleft}{\textquoteleft}late-solved{\textquoteright}{\textquoteright} images. This suggests the necessity of recurrent processing in these computations. Which brain circuits are responsible for computing and transmitting these putative recurrent signals to IT? To test whether the ventrolateral prefrontal cortex (vlPFC) is a critical recurrent node in this system, here, we pharmacologically inactivated parts of vlPFC and simultaneously measured IT activity while monkeys performed object discrimination tasks. vlPFC inactivation deteriorated the quality of late-phase (\>150 ms from image onset) IT population code and produced commensurate behavioral deficits for late-solved images. Finally, silencing vlPFC caused the monkeys{\textquoteright} IT activity and behavior to become more like those produced by feedforward-only ventral stream models. Together with prior work, these results implicate fast recurrent processing through vlPFC as critical to producing behaviorally sufficient object representations in IT.

}, issn = {08966273}, doi = {10.1016/j.neuron.2020.09.035}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0896627320307595}, author = {Kohitij Kar and James J. DiCarlo} } @conference {4527, title = {Hierarchical neural network models that more closely match primary visual cortex tend to better explain higher level visual cortical responses }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, Colorado, USA}, author = {Tiago Marques and Martin Schrimpf and James J. DiCarlo} } @article {4600, title = {The inferior temporal cortex is a potential cortical precursor of orthographic processing in untrained monkeys}, journal = {Nature Communications}, volume = {11}, year = {2020}, month = {08/2020}, abstract = {

The ability to recognize written letter strings is foundational to human reading, but the underlying neuronal mechanisms remain largely unknown. Recent behavioral research in baboons suggests that non-human primates may provide an opportunity to investigate this question. We recorded the activity of hundreds of neurons in V4 and the inferior temporal cortex (IT) while na{\"\i}ve macaque monkeys passively viewed images of letters, English words and non-word strings, and tested the capacity of those neuronal representations to support a battery of orthographic processing tasks. We found that simple linear read-outs of IT (but not V4) population responses achieved high performance on all tested tasks, even matching the performance and error patterns of baboons on word classification. These results show that the IT cortex of untrained primates can serve as a precursor of orthographic processing, suggesting that the acquisition of reading in humans relies on the recycling of a brain network evolved for other visual functions.

}, doi = {10.1038/s41467-020-17714-3}, url = {http://www.nature.com/articles/s41467-020-17714-3}, author = {Rishi Rajalingham and Kohitij Kar and Sachi Sanghavi and Dehaene, Stanislas and James J. DiCarlo} } @article {4810, title = {Integrative Benchmarking to Advance Neurally Mechanistic Models of Human Intelligence}, journal = {Neuron}, volume = {108}, year = {2020}, month = {11/2020}, pages = {413 - 423}, issn = {08966273}, doi = {10.1016/j.neuron.2020.07.040}, url = {https://linkinghub.elsevier.com/retrieve/pii/S089662732030605X}, author = {Martin Schrimpf and Kubilius, Jonas and Lee, Michael J. and N. Apurva Ratan Murty and Ajemian, Robert and James J. DiCarlo} } @proceedings {4692, title = {Simulating a Primary Visual Cortex at the Front of CNNs Improves Robustness to Image Perturbations}, year = {2020}, month = {12/2020}, abstract = {

Current state-of-the-art object recognition models are largely based on convolutional neural network (CNN) architectures, which are loosely inspired by the primate visual system. However, these CNNs can be fooled by imperceptibly small, explicitly crafted perturbations, and struggle to recognize objects in corrupted images that are easily recognized by humans. Here, by making comparisons with primate neural data, we first observed that CNN models with a neural hidden layer that better matches primate primary visual cortex (V1) are also more robust to adversarial attacks. Inspired by this observation, we developed VOneNets, a new class of hybrid CNN vision models. Each VOneNet contains a fixed weight neural network front-end that simulates primate V1, called the VOneBlock, followed by a neural network back-end adapted from current CNN vision models. The VOneBlock is based on a classical neuroscientific model of V1: the linear-nonlinear-Poisson model, consisting of a biologically-constrained Gabor filter bank, simple and complex cell nonlinearities, and a V1 neuronal stochasticity generator. After training, VOneNets retain high ImageNet performance, but each is substantially more robust, outperforming the base CNNs and state-of-the-art methods by 18\% and 3\%, respectively, on a conglomerate benchmark of perturbations comprised of white box adversarial attacks and common image corruptions. Finally, we show that all components of the VOneBlock work in synergy to improve robustness. While current CNN architectures are arguably brain-inspired, the results presented here demonstrate that more precisely mimicking just one stage of the primate visual system leads to new gains in ImageNet-level computer vision applications.

Github: https://github.com/dicarlolab/vonenet

}, url = {https://proceedings.neurips.cc/paper/2020/hash/98b17f068d5d9b7668e19fb8ae470841-Abstract.html}, author = {Joel Dapello and Tiago Marques and Martin Schrimpf and Franziska Geiger and David Cox and James J. DiCarlo} } @conference {4528, title = {Temporal information for action recognition only needs to be integrated at a choice level in neural networks and primates }, booktitle = {COSYNE}, year = {2020}, month = {02/2020}, address = {Denver, CO, USA}, author = {Martin Schrimpf and Fukushi Sato and Sachi Sanghavi and James J. DiCarlo} } @article {4632, title = {ThreeDWorld: A Platform for Interactive Multi-Modal Physical Simulation}, journal = {arXiv}, year = {2020}, month = {07/2020}, type = {Preprint}, abstract = {

We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) realtime near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.

}, url = {https://arxiv.org/abs/2007.04954}, author = {Chuang Gan and Jeremy Schwartz and Seth Alter and Martin Schrimpf and James Traer and Julian De Freitas and Jonas Kubilius and Abhishek Bhandwaldar and Nick Haber and Megumi Sano and Kuno Kim and Elias Wang and Damian Mrowca and Michael Lingelbach and Aidan Curtis and Kevin Feigleis and Daniel Bear and Dan Gutfreund and David Cox and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins} } @article {4633, title = {ThreeDWorld (TDW): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation}, year = {2020}, month = {07/2020}, abstract = {

TDW is a 3D virtual world simulation platform, utilizing state-of-the-art video game engine technology.

A TDW simulation consists of two components: a) the Build, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis and physics simulations; and b) the Controller, an external Python interface to communicate with the build.

Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world.

TDW provides researchers with:

TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence.

Find out more about ThreeDWorld on the project website using the link below.

}, url = {http://www.threedworld.org/}, author = {Jeremy Schwartz and Seth Alter and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins and Dan Gutfreund and Chuang Gan and James Traer and Jonas Kubilius and Martin Schrimpf and Abhishek Bhandwaldar and Julian De Freitas and Damian Mrowca and Michael Lingelbach and Megumi Sano and Daniel Bear and Kuno Kim and Nick Haber and Chaofei Fan} } @conference {4323, title = {Are topographic deep convolutional neural networks better models of the ventral visual stream?}, booktitle = {Conference on Cognitive Computational Neuroscience}, year = {2019}, author = {K.M. Jozwik and Lee, H. and Nancy Kanwisher and James J. DiCarlo} } @proceedings {4379, title = {Brain-Like Object Recognition with High-Performing Shallow Recurrent ANNs}, year = {2019}, month = {10/2019}, address = {Vancouver, Canada}, abstract = {

Deep convolutional artificial neural networks (ANNs) are the leading class of candidate models of the mechanisms of visual processing in the primate ventral stream. While initially inspired by brain anatomy, over the past years, these ANNs have evolved from a simple eight-layer architecture in AlexNet to extremely deep and branching architectures, demonstrating increasingly better object categorization performance, yet bringing into question how brain-like they still are. In particular, typical deep models from the machine learning community are often hard to map onto the brain{\textquoteright}s anatomy due to their vast number of layers and missing biologically-important connections, such as recurrence. Here we demonstrate that better anatomical alignment to the brain and high performance on machine learning as well as neuroscience measures do not have to be in contradiction. We developed CORnet-S, a shallow ANN with four anatomically mapped areas and recurrent connectivity, guided by Brain-Score, a new large-scale composite of neural and behavioral benchmarks for quantifying the functional fidelity of models of the primate ventral visual stream. Despite being significantly shallower than most models, CORnet-S is the top model on Brain-Score and outperforms similarly compact models on ImageNet. Moreover, our extensive analyses of CORnet-S circuitry variants reveal that recurrence is the main predictive factor of both Brain- Score and ImageNet top-1 performance. Finally, we report that the temporal evolution of the CORnet-S "IT" neural population resembles the actual monkey IT population dynamics. Taken together, these results establish CORnet-S, a compact, recurrent ANN, as the current best model of the primate ventral visual stream.

}, author = {Jonas Kubilius and Martin Schrimpf and Kohitij Kar and Rishi Rajalingham and Ha Hong and Najib J. Majaj and Elias B. Issa and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Aran Nayebi and Daniel Bear and Daniel L K Yamins and James J. DiCarlo} } @article {4141, title = {Evidence that recurrent circuits are critical to the ventral stream{\textquoteright}s execution of core object recognition behavior}, journal = {Nature Neuroscience}, year = {2019}, month = {04/2019}, abstract = {

Non-recurrent deep convolutional neural networks (DCNNs) are currently the best models of core object recognition; a behavior supported by the densely recurrent primate ventral stream, culminating in the inferior temporal (IT) cortex. Are these recurrent circuits critical to the ventral stream{\textquoteright}s execution of this behavior? We reasoned that, if recurrence is critical, then primates should outperform feedforward-only DCNNs for some images, and that these images should require additional processing time beyond the feedforward IT response. Here we first used behavioral methods to discover hundreds of these {\textquotedblleft}challenge{\textquotedblright} images. Second, using large- scale IT electrophysiology in animals performing core recognition tasks, we observed that behaviorally-sufficient, linearly-decodable object identity solutions emerged ~30ms (on average) later in IT for challenge images compared to DCNN and primate performance-matched {\textquotedblleft}control{\textquotedblright} images. We observed these same late solutions even during passive viewing. Third, consistent with a failure of feedforward computations, the behaviorally-critical late-phase IT population response patterns evoked by the challenge images were poorly predicted by DCNN activations. Interestingly, very deep CNNs as well as not-so-deep but recurrent CNNs better predicted these late IT responses, suggesting a functional equivalence between additional nonlinear transformations and recurrence. Our results argue that automatically-evoked recurrent circuits are critical even for rapid object identification. By precisely comparing current DCNNs, primate behavior and IT population dynamics, we provide guidance for future recurrent model development.

}, doi = {10.1038/s41593-019-0392-5}, url = {https://www.nature.com/articles/s41593-019-0392-5}, author = {Kohitij Kar and Jonas Kubilius and Kailyn Schmidt and Elias B. Issa and James J. DiCarlo} } @conference {4531, title = {Evidence that recurrent pathways between the prefrontal and inferior temporal cortex is critical during core object recognition }, booktitle = {Society for Neuroscience}, year = {2019}, month = {10/2019}, address = {Chicago, IL, USA}, author = {Kohitij Kar and James J. DiCarlo} } @conference {4530, title = {A meta-analysis of ANNs as models of primate V1 }, booktitle = {Bernstein}, year = {2019}, month = {09/2019}, address = {Berlin, Germany}, author = {Tiago Marques and James J. DiCarlo} } @article {4143, title = {Neural Population Control via Deep Image Synthesis}, journal = {Science}, volume = {364}, year = {2019}, month = {05/2019}, abstract = {Particular deep artificial neural networks (ANNs) are today{\textquoteright}s most accurate models of the primate brain{\textquoteright}s ventral visual stream. Here we report that, using an ANN-driven image synthesis method, new luminous power patterns (i.e. images) can be applied to the primate retinae to predictably push the spiking activity of targeted V4 neural sites beyond naturally occurring levels. More importantly, this method, while not yet perfect, achieves unprecedented independent control of the activity state of entire populations of V4 neural sites, even those with overlapping receptive fields. These results show how the knowledge embedded in today{\textquoteright}s ANN models might be used to noninvasively set desired internal brain states at neuron-level resolution, and suggest that more accurate ANN models would produce even more accurate control. }, doi = {10.1126/science.aav9436 }, url = {https://science.sciencemag.org/content/364/6439/eaav9436}, author = {Pouya Bashivan and Kohitij Kar and James J. 
DiCarlo} } @conference {4322, title = {To find better neural network models of human vision, find better neural network models of primate vision}, booktitle = {BioRxiv}, year = {2019}, abstract = {

Specific deep artificial neural networks (ANNs) are the current best models of ventral visual processing and object recognition behavior in monkeys. We here explore whether models of non-human primate vision generalize to visual processing in the human primate brain. Specifically, we asked if model match to monkey IT is a predictor of model match to human IT, even when scoring those matches on different images. We found that the model match to monkey IT is a positive predictor of the model match to human IT (R = 0.36), and that this approach outperforms the current standard predictor of model accuracy on ImageNet. This suggests a more powerful approach for pre-selecting models as hypotheses of human brain processing.

}, url = {https://www.biorxiv.org/content/10.1101/688390v1.full}, author = {K.M. Jozwik and Martin Schrimpf and Nancy Kanwisher and James J. DiCarlo} } @article {4294, title = {Brain-Score: Which Artificial Neural Network for Object Recognition is most Brain-Like?}, journal = {bioRxiv preprint}, year = {2018}, abstract = {

The internal representations of early deep artificial neural networks (ANNs) were found to be remarkably similar to the internal neural representations measured experimentally in the primate brain. Here we ask, as deep ANNs have continued to evolve, are they becoming more or less brain-like? ANNs that are most functionally similar to the brain will contain mechanisms that are most like those used by the brain. We therefore developed Brain-Score {\textendash} a composite of multiple neural and behavioral benchmarks that score any ANN on how similar it is to the brain{\textquoteright}s mechanisms for core object recognition {\textendash} and we deployed it to evaluate a wide range of state-of-the-art deep ANNs. Using this scoring system, we here report that: (1) DenseNet-169, CORnet-S and ResNet-101 are the most brain-like ANNs. (2) There remains considerable variability in neural and behavioral responses that is not predicted by any ANN, suggesting that no ANN model has yet captured all the relevant mechanisms. (3) Extending prior work, we found that gains in ANN ImageNet performance led to gains on Brain-Score. However, correlation weakened at >= 70\% top-1 ImageNet performance, suggesting that additional guidance from neuroscience is needed to make further advances in capturing brain mechanisms. (4) We uncovered smaller (i.e. less complex) ANNs that are more brain-like than many of the best-performing ImageNet models, which suggests the opportunity to simplify ANNs to better understand the ventral stream. The scoring system used here is far from complete. However, we propose that evaluating and tracking model-benchmark correspondences through a Brain-Score that is regularly updated with new brain data is an exciting opportunity: experimental benchmarks can be used to guide machine network evolution, and machine networks are mechanistic hypotheses of the brain{\textquoteright}s network and thus drive next experiments. 
To facilitate both of these, we release Brain-Score.org: a platform that hosts the neural and behavioral benchmarks, where ANNs for visual processing can be submitted to receive a Brain-Score and their rank relative to other models, and where new experimental data can be naturally incorporated.

}, keywords = {computational neuroscience, deep learning, Neural Networks, object recognition, ventral stream}, doi = {10.1101/407007}, url = {https://www.biorxiv.org/content/10.1101/407007v1}, author = {Martin Schrimpf and Jonas Kubilius and Ha Hong and Najib J. Majaj and Rishi Rajalingham and Elias B. Issa and Kohitij Kar and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Daniel L K Yamins and James J. DiCarlo} } @article {3881, title = {Single units in a deep neural network functionally correspond with neurons in the brain: preliminary results}, year = {2018}, month = {11/2018}, abstract = {

Deep neural networks have been shown to predict neural responses in higher visual cortex. The mapping from the model to a neuron in the brain occurs through a linear combination of many units in the model, leaving open the question of whether there also exists a correspondence at the level of individual neurons. Here we show that there exist many one-to-one mappings between single units in a deep neural network model and neurons in the brain. We show that this correspondence at the single-unit level is ubiquitous among state-of-the-art deep neural networks, and grows more pronounced for models with higher performance on a large-scale visual recognition task. Comparing matched populations{\textemdash}in the brain and in a model{\textemdash}we demonstrate a further correspondence at the level of the population code: stimulus category can be partially decoded from real neural responses using a classifier trained purely on a matched population of artificial units in a model. This provides a new point of investigation for phenomena which require fine-grained mappings between deep neural networks and the brain.

}, author = {Luke Arend and Yena Han and Martin Schrimpf and Pouya Bashivan and Kohitij Kar and Tomaso Poggio and James J. DiCarlo and Xavier Boix} } @article {2881, title = {Neural Representation Benchmark [code]}, year = {2013}, abstract = {

A key requirement for the development of effective learning representations is their evaluation and comparison to representations we know to be effective. In natural sensory domains, the community has viewed the brain as a source of inspiration and as an implicit benchmark for success. However, it has not been possible to directly test representational learning algorithms against the representations contained in neural systems. Here, we propose a new benchmark for visual representations on which we have directly tested the neural representation in multiple visual cortical areas in macaque (utilizing data from [Majaj et al., 2012]), and on which any computer vision algorithm that produces a feature space can be tested. The benchmark measures the effectiveness of the neural or machine representation by computing the classification loss on the ordered eigendecomposition of a kernel matrix [Montavon et al., 2011]. In our analysis we find that the neural representation in visual area IT is superior to visual area V4. In our analysis of representational learning algorithms, we find that three-layer models approach the representational performance of V4 and the algorithm in [Le et al., 2012] surpasses the performance of V4. Impressively, we find that a recent supervised algorithm [Krizhevsky et al., 2012] achieves performance comparable to that of IT for an intermediate level of image variation difficulty, and surpasses IT at a higher difficulty level. We believe this result represents a major milestone: it is the first learning algorithm we have found that exceeds our current estimate of IT representation performance. We hope that this benchmark will assist the community in matching the representational performance of visual cortex and will serve as an initial rallying point for further correspondence between representations derived in brains and machines.

For more information and to download code, etc. please visit the project website - http://dicarlolab.mit.edu/neuralbenchmark

}, author = {Charles F. Cadieu and Ha Hong and Daniel L K Yamins and Nicolas Pinto and Najib J. Majaj and James J. DiCarlo} }