@comment{arXiv preprint introducing the ThreeDWorld (TDW) simulation platform. The eprint id is taken from the arXiv URL. The compound surname "De Freitas" is entered in "Last, First" form so BibTeX does not split it.}
@article {4632,
	title = {{ThreeDWorld}: A Platform for Interactive Multi-Modal Physical Simulation},
	journal = {arXiv},
	year = {2020},
	month = jul,
	type = {Preprint},
	abstract = {<p>We introduce ThreeDWorld (TDW), a platform for interactive multi-modal physical simulation. With TDW, users can simulate high-fidelity sensory data and physical interactions between mobile agents and objects in a wide variety of rich 3D environments. TDW has several unique properties: 1) realtime near photo-realistic image rendering quality; 2) a library of objects and environments with materials for high-quality rendering, and routines enabling user customization of the asset library; 3) generative procedures for efficiently building classes of new environments 4) high-fidelity audio rendering; 5) believable and realistic physical interactions for a wide variety of material types, including cloths, liquid, and deformable objects; 6) a range of "avatar" types that serve as embodiments of AI agents, with the option for user avatar customization; and 7) support for human interactions with VR devices. TDW also provides a rich API enabling multiple agents to interact within a simulation and return a range of sensor and physics data representing the state of the world. We present initial experiments enabled by the platform around emerging research directions in computer vision, machine learning, and cognitive science, including multi-modal physical scene understanding, multi-agent interactions, models that "learn like a child", and attention studies in humans and neural networks. The simulation platform will be made publicly available.</p>
},
	eprint = {2007.04954},
	archiveprefix = {arXiv},
	url = {https://arxiv.org/abs/2007.04954},
	author = {Chuang Gan and Jeremy Schwartz and Seth Alter and Martin Schrimpf and James Traer and De Freitas, Julian and Jonas Kubilius and Abhishek Bhandwaldar and Nick Haber and Megumi Sano and Kuno Kim and Elias Wang and Damian Mrowca and Michael Lingelbach and Aidan Curtis and Kevin Feigelis and Daniel Bear and Dan Gutfreund and David Cox and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins}
}
@comment{Project-website entry for ThreeDWorld (TDW). Typed as misc because there is no journal and the only locator is the project URL. The compound surname "De Freitas" is entered in "Last, First" form so BibTeX does not split it.}
@misc {4633,
	title = {{ThreeDWorld} ({TDW}): A High-Fidelity, Multi-Modal Platform for Interactive Physical Simulation},
	year = {2020},
	month = jul,
	abstract = {<h3>TDW is a 3D virtual world simulation platform, utilizing state-of-the-art video game engine technology</h3>

<p>A TDW simulation consists of two components: a) the <strong>Build</strong>, a compiled executable running on the Unity3D Engine, which is responsible for image rendering, audio synthesis and physics simulations; and b) the <strong>Controller</strong>, an external Python interface to communicate with the build.</p>

<p>Researchers write Controllers that send commands to the Build, which executes those commands and returns a broad range of data types representing the state of the virtual world.</p>

<p>TDW provides researchers with:</p>

<ul>
	<li>A general, flexible design that does not impose constraints on the types of use-cases it can support, nor force any particular metaphor on the user.</li>
	<li>Support for multiple modalities -- visual rendering with near-photoreal image quality, coupled with superior audio rendering fidelity.</li>
	<li>A comprehensive, highly extensible and thoroughly documented command and control Python API.</li>
	<li>Multiple paradigms for object interaction, capable of generating physically-realistic behavior.</li>
</ul>

<p>TDW is being used on a daily basis in multiple labs, supporting research that sits at the nexus of neuroscience, cognitive science and artificial intelligence.</p>

<p>Find out more about ThreeDWorld on the project website using the link below.</p>
},
	url = {http://www.threedworld.org/},
	author = {Jeremy Schwartz and Seth Alter and James J. DiCarlo and Josh H. McDermott and Joshua B. Tenenbaum and Daniel L K Yamins and Dan Gutfreund and Chuang Gan and James Traer and Jonas Kubilius and Martin Schrimpf and Abhishek Bhandwaldar and De Freitas, Julian and Damian Mrowca and Michael Lingelbach and Megumi Sano and Daniel Bear and Kuno Kim and Nick Haber and Chaofei Fan}
}
@comment{Conference paper describing CORnet-S. Typed as inproceedings because standard styles ignore the author field on proceedings entries. NOTE(review): booktitle was added from the published NeurIPS 2019 record, not from this file; confirm against the proceedings.}
@inproceedings {4379,
	title = {Brain-Like Object Recognition with High-Performing Shallow Recurrent {ANNs}},
	booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS 2019)},
	year = {2019},
	month = oct,
	address = {Vancouver, Canada},
	abstract = {<p>Deep convolutional artificial neural networks (ANNs) are the leading class of candidate models of the mechanisms of visual processing in the primate ventral stream. While initially inspired by brain anatomy, over the past years, these ANNs have evolved from a simple eight-layer architecture in AlexNet to extremely deep and branching architectures, demonstrating increasingly better object categorization performance, yet bringing into question how brain-like they still are. In particular, typical deep models from the machine learning community are often hard to map onto the brain{\textquoteright}s anatomy due to their vast number of layers and missing biologically-important connections, such as recurrence. Here we demonstrate that better anatomical alignment to the brain and high performance on machine learning as well as neuroscience measures do not have to be in contradiction. We developed CORnet-S, a shallow ANN with four anatomically mapped areas and recurrent connectivity, guided by Brain-Score, a new large-scale composite of neural and behavioral benchmarks for quantifying the functional fidelity of models of the primate ventral visual stream. Despite being significantly shallower than most models, CORnet-S is the top model on Brain-Score and outperforms similarly compact models on ImageNet. Moreover, our extensive analyses of CORnet-S circuitry variants reveal that recurrence is the main predictive factor of both Brain-Score and ImageNet top-1 performance. Finally, we report that the temporal evolution of the CORnet-S "IT" neural population resembles the actual monkey IT population dynamics. Taken together, these results establish CORnet-S, a compact, recurrent ANN, as the current best model of the primate ventral visual stream.</p>
},
	author = {Jonas Kubilius and Martin Schrimpf and Kohitij Kar and Rishi Rajalingham and Ha Hong and Najib J. Majaj and Elias B. Issa and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Aran Nayebi and Daniel Bear and Daniel L K Yamins and James J. DiCarlo}
}
@comment{Nature Neuroscience article on recurrent circuits in core object recognition. The approximate sign before 30ms is written as a text tilde because a bare tilde is a non-breaking space in LaTeX. NOTE(review): volume and pages are not recorded here; look them up via the DOI.}
@article {4141,
	title = {Evidence that recurrent circuits are critical to the ventral stream{\textquoteright}s execution of core object recognition behavior},
	journal = {Nature Neuroscience},
	year = {2019},
	month = apr,
	abstract = {<p>Non-recurrent deep convolutional neural networks (DCNNs) are currently the best models of core object recognition; a behavior supported by the densely recurrent primate ventral stream, culminating in the inferior temporal (IT) cortex. Are these recurrent circuits critical to the ventral stream{\textquoteright}s execution of this behavior? We reasoned that, if recurrence is critical, then primates should outperform feedforward-only DCNNs for some images, and that these images should require additional processing time beyond the feedforward IT response. Here we first used behavioral methods to discover hundreds of these {\textquotedblleft}challenge{\textquotedblright} images. Second, using large-scale IT electrophysiology in animals performing core recognition tasks, we observed that behaviorally-sufficient, linearly-decodable object identity solutions emerged {\textasciitilde}30ms (on average) later in IT for challenge images compared to DCNN and primate performance-matched {\textquotedblleft}control{\textquotedblright} images. We observed these same late solutions even during passive viewing. Third, consistent with a failure of feedforward computations, the behaviorally-critical late-phase IT population response patterns evoked by the challenge images were poorly predicted by DCNN activations. Interestingly, very deep CNNs as well as not-so-deep but recurrent CNNs better predicted these late IT responses, suggesting a functional equivalence between additional nonlinear transformations and recurrence. Our results argue that automatically-evoked recurrent circuits are critical even for rapid object identification. By precisely comparing current DCNNs, primate behavior and IT population dynamics, we provide guidance for future recurrent model development.</p>
},
	doi = {10.1038/s41593-019-0392-5},
	url = {https://www.nature.com/articles/s41593-019-0392-5},
	author = {Kohitij Kar and Jonas Kubilius and Kailyn Schmidt and Elias B. Issa and James J. DiCarlo}
}
@comment{bioRxiv preprint introducing Brain-Score. All contributors are listed in author; article styles do not print an editor field, so co-authors stored there would be dropped from the citation.}
@article {4294,
	title = {{Brain-Score}: Which Artificial Neural Network for Object Recognition is most Brain-Like?},
	journal = {bioRxiv preprint},
	year = {2018},
	abstract = {<p>The internal representations of early deep artificial neural networks (ANNs) were found to be remarkably similar to the internal neural representations measured experimentally in the primate brain. Here we ask, as deep ANNs have continued to evolve, are they becoming more or less brain-like? ANNs that are most functionally similar to the brain will contain mechanisms that are most like those used by the brain. We therefore developed <em>Brain-Score</em> {\textendash} a composite of multiple neural and behavioral benchmarks that score any ANN on how similar it is to the brain{\textquoteright}s mechanisms for core object recognition {\textendash} and we deployed it to evaluate a wide range of state-of-the-art deep ANNs. Using this scoring system, we here report that: (1) DenseNet-169, CORnet-S and ResNet-101 are the most brain-like ANNs. (2) There remains considerable variability in neural and behavioral responses that is not predicted by any ANN, suggesting that no ANN model has yet captured all the relevant mechanisms. (3) Extending prior work, we found that gains in ANN ImageNet performance led to gains on Brain-Score. However, correlation weakened at <em>>=</em> 70\% top-1 ImageNet performance, suggesting that additional guidance from neuroscience is needed to make further advances in capturing brain mechanisms. (4) We uncovered smaller (i.e. less complex) ANNs that are more brain-like than many of the best-performing ImageNet models, which suggests the opportunity to simplify ANNs to better understand the ventral stream. The scoring system used here is far from complete. However, we propose that evaluating and tracking model-benchmark correspondences through a Brain-Score that is regularly updated with new brain data is an exciting opportunity: experimental benchmarks can be used to guide machine network evolution, and machine networks are mechanistic hypotheses of the brain{\textquoteright}s network and thus drive next experiments.
To facilitate both of these, we release Brain-Score.org: a platform that hosts the neural and behavioral benchmarks, where ANNs for visual processing can be submitted to receive a Brain-Score and their rank relative to other models, and where new experimental data can be naturally incorporated.</p>
},
	keywords = {computational neuroscience, deep learning, Neural Networks, object recognition, ventral stream},
	doi = {10.1101/407007},
	url = {https://www.biorxiv.org/content/10.1101/407007v1},
	author = {Martin Schrimpf and Jonas Kubilius and Ha Hong and Najib J. Majaj and Rishi Rajalingham and Elias B. Issa and Kohitij Kar and Pouya Bashivan and Jonathan Prescott-Roy and Kailyn Schmidt and Daniel L K Yamins and James J. DiCarlo}
}
@comment{NeuroImage article on the occipital place area (OPA). The page range uses a double hyphen and all author names use the "Last, First" form.}
@article {2571,
	title = {The occipital place area represents the local elements of scenes},
	journal = {NeuroImage},
	volume = {132},
	year = {2016},
	month = feb,
	pages = {417--424},
	abstract = {<p>Neuroimaging studies have identified three scene-selective regions in human cortex: parahippocampal place area (PPA), retrosplenial complex (RSC), and occipital place area (OPA). However, precisely what scene information each region represents is not clear, especially for the least studied, more posterior OPA. Here we hypothesized that OPA represents local elements of scenes within two independent, yet complementary scene descriptors: spatial boundary (i.e., the layout of external surfaces) and scene content (e.g., internal objects). If OPA processes the local elements of spatial boundary information, then it should respond to these local elements (e.g., walls) themselves, regardless of their spatial arrangement. Indeed, we found that OPA, but not PPA or RSC, responded similarly to images of intact rooms and these same rooms in which the surfaces were fractured and rearranged, disrupting the spatial boundary. Next, if OPA represents the local elements of scene content information, then it should respond more when more such local elements (e.g., furniture) are present. Indeed, we found that OPA, but not PPA or RSC, responded more to multiple than single pieces of furniture. Taken together, these findings reveal that OPA analyzes local scene elements - both in spatial boundary and scene content representation - while PPA and RSC represent global scene properties.</p>
},
	issn = {1053-8119},
	doi = {10.1016/j.neuroimage.2016.02.062},
	url = {https://www.ncbi.nlm.nih.gov/pubmed/26931815},
	author = {Kamps, Frederik S. and Julian, Joshua B. and Kubilius, Jonas and Kanwisher, Nancy and Dilks, Daniel D.}
}