% Ullman et al. (2021), "Image interpretation by iterative bottom-up top-down processing".
% NOTE(review): the article entry type requires a journal field, which this record lacks.
% A bare number of 120 with no journal or volume looks like a memo/report series --
% confirm the venue and either add the journal or switch to a report entry type.
% The abstract keeps the simple <p> HTML wrapper used by the other records in this file;
% table/div layout markup from the original paste has been removed.
@article {5009,
	title = {Image interpretation by iterative bottom-up top-down processing},
	number = {120},
	year = {2021},
	month = nov,
	abstract = {<p>Scene understanding requires the extraction and representation of scene components, such as objects and their parts, people, and places, together with their individual properties, as well as relations and interactions between them. We describe a model in which meaningful scene structures are extracted from the image by an iterative process, combining bottom-up (BU) and top-down (TD) networks, interacting through a symmetric bi-directional communication between them ({\textquoteleft}counter-streams{\textquoteright} structure). The BU-TD model extracts and recognizes scene constituents with their selected properties and relations, and uses them to describe and understand the image.</p>
<p>The scene representation is constructed by the iterative use of three components. The first model component is a bottom-up stream that extracts selected scene elements, properties and relations. The second component ({\textquoteleft}cognitive augmentation{\textquoteright}) augments the extracted visual representation based on relevant non-visual stored representations. It also provides input to the third component, the top-down stream, in the form of a TD instruction, instructing the model what task to perform next. The top-down stream then guides the BU visual stream to perform the selected task in the next cycle. During this process, the visual representations extracted from the image can be combined with relevant non-visual representations, so that the final scene representation is based on both visual information extracted from the scene and relevant stored knowledge of the world.</p>
<p>We show how the BU-TD model composes complex visual tasks from sequences of steps, invoked by individual TD instructions. In particular, we describe how a sequence of TD-instructions is used to extract from the scene structures of interest, including an algorithm to automatically select the next TD-instruction in the sequence. The selection of TD instruction depends in general on the goal, the image, and on information already extracted from the image in previous steps. The TD-instructions sequence is therefore not a fixed sequence determined at the start, but an evolving program (or {\textquoteleft}visual routine{\textquoteright}) that depends on the goal and the image.</p>
<p>The extraction process is shown to have favourable properties in terms of combinatorial generalization, generalizing well to novel scene structures and new combinations of objects, properties and relations not seen during training. Finally, we compare the model with relevant aspects of the human vision, and suggest directions for using the BU-TD scheme for integrating visual and cognitive components in the process of scene understanding.</p>
},
	author = {Shimon Ullman and Liav Assif and Alona Strugatski and Ben-Zion Vatashsky and Hila Levi and Aviv Netanyahu and Adam Uri Yaari}
}
% AAAI-21 conference version of the PHASE paper.
% The same title and author list also appear in this file under keys 5058 (2021 article
% record) and 4700 (NeurIPS 2020 workshop record); cite the version actually intended.
% Braces in the title protect the acronym and its deliberate capitalisation from
% sentence-casing bibliography styles.
@inproceedings {4830,
	title = {{PHASE}: {PHysically-grounded Abstract Social Events} for Machine Social Perception},
	booktitle = {AAAI-21},
	year = {2021},
	abstract = {<p>The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.</p>
},
	author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum}
}
% Article-type record (number 123, March 2021) of the PHASE paper.
% The same title and author list also appear in this file under keys 4830 (AAAI-21)
% and 4700 (NeurIPS 2020 workshop); cite the version actually intended.
% NOTE(review): the article entry type requires a journal field, which this record lacks.
% A bare number with no journal or volume looks like a memo/report series -- confirm
% the venue and either add the journal or switch to a report entry type.
% Braces in the title protect the acronym and its deliberate capitalisation from
% sentence-casing bibliography styles.
@article {5058,
	title = {{PHASE}: {PHysically-grounded Abstract Social Events} for Machine Social Perception},
	number = {123},
	year = {2021},
	month = mar,
	abstract = {<p>The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feedforward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.</p>
},
	author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum}
}
% NeurIPS 2020 SVRHM workshop version of the PHASE paper.
% The same title and author list also appear in this file under keys 4830 (AAAI-21)
% and 5058 (2021 article record); cite the version actually intended.
% Braces in the title protect the acronym and its deliberate capitalisation from
% sentence-casing bibliography styles.
@inproceedings {4700,
	title = {{PHASE}: {PHysically-grounded Abstract Social Events} for Machine Social Perception},
	booktitle = {Shared Visual Representations in Human and Machine Intelligence (SVRHM) workshop at NeurIPS 2020},
	year = {2020},
	month = dec,
	abstract = {<p>The ability to perceive and reason about social interactions in the context of physical environments is core to human social intelligence and human-machine cooperation. However, no prior dataset or benchmark has systematically evaluated physically grounded perception of complex social interactions that go beyond short actions, such as high-fiving, or simple group activities, such as gathering. In this work, we create a dataset of physically-grounded abstract social events, PHASE, that resemble a wide range of real-life social interactions by including social concepts such as helping another agent. PHASE consists of 2D animations of pairs of agents moving in a continuous space generated procedurally using a physics engine and a hierarchical planner. Agents have a limited field of view, and can interact with multiple objects, in an environment that has multiple landmarks and obstacles. Using PHASE, we design a social recognition task and a social prediction task. PHASE is validated with human experiments demonstrating that humans perceive rich interactions in the social events, and that the simulated agents behave similarly to humans. As a baseline model, we introduce a Bayesian inverse planning approach, SIMPLE (SIMulation, Planning and Local Estimation), which outperforms state-of-the-art feed-forward neural networks. We hope that PHASE can serve as a difficult new challenge for developing new models that can recognize complex social interactions.</p>
},
	url = {https://openreview.net/forum?id=_bokm801zhx},
	author = {Aviv Netanyahu and Tianmin Shu and Boris Katz and Andrei Barbu and Joshua B. Tenenbaum}
}