[ { "id": "-3G6_D66Aua", "title": "Simultaneous Learning of Contact and Continuous Dynamics", "track": "main", "status": "Poster", "tldr": "We simultaneously learn contact and continuous dynamics of novel objects through contact-rich trajectories, using model-based structure and residual physics.", "abstract": "Robotic manipulation can greatly benefit from the data efficiency, robustness, and predictability of model-based methods if robots can quickly generate models of novel objects they encounter. This is especially difficult when effects like complex joint friction lack clear first-principles models and are usually ignored by physics simulators. Further, numerically-stiff contact dynamics can make common model-building approaches struggle. We propose a method to simultaneously learn contact and continuous dynamics of a novel, possibly multi-link object by observing its motion through contact-rich trajectories. We formulate a system identification process with a loss that infers unmeasured contact forces, penalizing their violation of physical constraints and laws of motion given current model parameters. Our loss is unlike prediction-based losses used in differentiable simulation. Using a new dataset of real articulated object trajectories and an existing cube toss dataset, our method outperforms differentiable simulation and end-to-end alternatives with more data efficiency. See our project page for code, datasets, and media: https://sites.google.com/view/continuous-contact-nets/home", "keywords": "system identification;dynamics learning;contact-rich manipulation", "primary_area": "", "supplementary_material": "/attachment/1d9434f8d8f315ff27c63f53f765b688bff8c595.zip", "author": "Bibit Bianchini;Mathew Halm;Michael Posa", "authorids": "~Bibit_Bianchini1;~Mathew_Halm1;~Michael_Posa1", "gender": ";M;M", "homepage": "http://www.bianchini-love.com/bibit;https://matthalm.net/;https://dair.seas.upenn.edu/", "dblp": ";;129/2382", "google_scholar": "lVj0WaQAAAAJ;TXz7rb8AAAAJ;DCSFMuAAAAAJ", "orcid": ";;", "linkedin": "ebianchini/;;", "or_profile": "~Bibit_Bianchini1;~Mathew_Halm1;~Michael_Posa1", "aff": "University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;seas.upenn.edu;upenn.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbianchini2023simultaneous,\ntitle={Simultaneous Learning of Contact and Continuous Dynamics},\nauthor={Bibit Bianchini and Mathew Halm and Michael Posa},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=-3G6_D66Aua}\n}", "github": "https://github.com/ebianchi/dair_pll", "project": "", "reviewers": "TJk2;w1Bc;XnXf;h2vE", "site": "https://openreview.net/forum?id=-3G6_D66Aua", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;3;2", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4561386072969302676&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "-HFJuX1uqs", "title": "Act3D: 3D Feature Field Transformers for Multi-Task 
Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "We introduce Act3D, a manipulation policy transformer that represents the robot\u2019s workspace using a 3D feature field with adaptive resolutions.", "abstract": "3D perceptual representations are well suited for robot manipulation as they easily encode occlusions and simplify spatial reasoning. Many manipulation tasks require high spatial precision in end-effector pose prediction, which typically demands high-resolution 3D feature grids that are computationally expensive to process. As a result, most manipulation policies operate directly in 2D, foregoing 3D inductive biases. In this paper, we introduce Act3D, a manipulation policy transformer that represents the robot\u2019s workspace using a 3D feature field with adaptive resolutions dependent on the task at hand. The model lifts 2D pre-trained features to 3D using sensed depth, and attends to them to compute features for sampled 3D points. It samples 3D point grids in a coarse to fine manner, featurizes them using relative-position attention, and selects where to focus the next round of point sampling. In this way, it efficiently computes 3D action maps of high spatial resolution. Act3D sets a new state-of-the-art in RLBench, an established manipulation benchmark, where it achieves 10% absolute improvement over the previous SOTA 2D multi-view policy on 74 RLBench tasks and 22% absolute improvement with 3x less compute over the previous SOTA 3D policy. We quantify the importance of relative spatial attention, large-scale vision-language pre-trained 2D backbones, and weight tying across coarse-to-fine attentions in ablative experiments.", "keywords": "Learning from Demonstrations;Manipulation;Transformers", "primary_area": "", "supplementary_material": "/attachment/4a12c9f05b9118fe4f9d6f409eefa83f3d7cf278.zip", "author": "Theophile Gervet;Zhou Xian;Nikolaos Gkanatsios;Katerina Fragkiadaki", "authorids": "~Theophile_Gervet1;~Zhou_Xian1;~Nikolaos_Gkanatsios1;~Katerina_Fragkiadaki1", "gender": "M;M;M;F", "homepage": "https://theophilegervet.github.io;;https://nickgkan.github.io/;https://www.cs.cmu.edu/~katef/", "dblp": ";258/5020;225/5677;21/8780", "google_scholar": "-o8kQPwAAAAJ;;https://scholar.google.gr/citations?user=jk7GqOEAAAAJ;FWp7728AAAAJ", "orcid": ";;;", "linkedin": "theophile-gervet/;;;", "or_profile": "~Theophile_Gervet1;~Zhou_Xian1;~Nikolaos_Gkanatsios1;~Katerina_Fragkiadaki1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;Graduate student;Assistant Professor", "bibtex": "@inproceedings{\ngervet2023actd,\ntitle={Act3D: 3D Feature Field Transformers for Multi-Task Robotic Manipulation},\nauthor={Theophile Gervet and Zhou Xian and Nikolaos Gkanatsios and Katerina Fragkiadaki},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=-HFJuX1uqs}\n}", "github": "https://github.com/zhouxian/chained-diffuser", "project": "", "reviewers": "C6Eb;hSNU;4R1b;fEwK", "site": "https://openreview.net/forum?id=-HFJuX1uqs", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;3;4", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 71, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13294023682227072291&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, 
"aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "-K7-1WvKO3F", "title": "ViNT: A Foundation Model for Visual Navigation", "track": "main", "status": "Oral", "tldr": "Large Transformer-based backbone for navigation, trained with cross-embodiment datasets, that supports broad generalization and efficient adaptation to downstream tasks.", "abstract": "General-purpose pre-trained models (``foundation models'') have enabled practitioners to produce generalizable solutions for individual machine learning problems with datasets that are significantly smaller than those required for learning from scratch. Such models are typically trained on large and diverse datasets with weak supervision, consuming much more training data than is available for any individual downstream application. In this paper, we describe the Visual Navigation Transformer (ViNT), a foundation model that aims to bring the success of general-purpose pre-trained models to vision-based robotic navigation. ViNT is trained with a general goal-reaching objective that can be used with any navigation dataset, and employs a flexible Transformer-based architecture to learn navigational affordances and enable efficient adaptation to a variety of downstream navigational tasks. ViNT is trained on a number of existing navigation datasets, comprising hundreds of hours of robotic navigation from a variety of different robotic platforms, and exhibits positive transfer, outperforming specialist models trained on narrower datasets. ViNT can be augmented with diffusion-based goal proposals to explore novel environments, and can solve kilometer-scale navigation problems when equipped with long-range heuristics. ViNT can also be adapted to novel task specifications with a technique inspired by prompt-tuning, where the goal encoder is replaced by an encoding of another task modality (e.g., GPS waypoints or turn-by-turn directions) embedded into the same space of goal tokens. 
This flexibility and ability to accommodate a variety of downstream problem domains establish ViNT as an effective foundation model for mobile robotics.", "keywords": "visual navigation;multi-task learning;planning;generalization", "primary_area": "", "supplementary_material": "/attachment/06bf1b5cf5c4659b2a7d042947f11a2cd443017e.zip", "author": "Dhruv Shah;Ajay Sridhar;Nitish Dashora;Kyle Stachowicz;Kevin Black;Noriaki Hirose;Sergey Levine", "authorids": "~Dhruv_Shah1;~Ajay_Sridhar1;~Nitish_Dashora1;~Kyle_Stachowicz1;~Kevin_Black2;~Noriaki_Hirose1;~Sergey_Levine1", "gender": "M;M;M;M;;M;M", "homepage": "http://cs.berkeley.edu/~shah;https://ajaysridhar.com;https://www.nitishdashora.com;https://kylesta.ch;https://kevin.black;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";;;;66/9687;126/5605;80/7594", "google_scholar": ";https://scholar.google.com/citations?hl=en;;;axX7PCwAAAAJ;https://scholar.google.co.jp/citations?user=xvOlfw8AAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;;", "linkedin": ";;dashora7/;;;;", "or_profile": "~Dhruv_Shah1;~Ajay_Sridhar1;~Nitish_Dashora1;~Kyle_Stachowicz1;~Kevin_Black2;~Noriaki_Hirose1;~Sergey_Levine1", "aff": "UC Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Toyota Central R&D Labs., Inc;Google", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;mosk.tytlabs.co.jp;google.com", "position": "PhD student;Undergrad student;Undergrad student;PhD student;PhD student;Researcher;Research Scientist", "bibtex": "@inproceedings{\nshah2023vint,\ntitle={Vi{NT}: A Foundation Model for Visual Navigation},\nauthor={Dhruv Shah and Ajay Sridhar and Nitish Dashora and Kyle Stachowicz and Kevin Black and Noriaki Hirose and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=-K7-1WvKO3F}\n}", "github": "https://github.com/robodhruv/visualnav-transformer", "project": "", "reviewers": "LWck;EDM6;ikk2;Q7NJ", "site": "https://openreview.net/forum?id=-K7-1WvKO3F", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "3;4;4;3", "rating_avg": 6.5, "confidence_avg": 3.5, "replies_avg": 14, "authors#_avg": 7, "corr_rating_confidence": -0.2294157338705618, "gs_citation": 154, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7196741364817045136&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1;2", "aff_unique_norm": "University of California, Berkeley;Toyota Central R&D Labs., Inc;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.toyota-global.com;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Toyota R&D;Google", "aff_campus_unique_index": "0;0;0;0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "United States;Japan" }, { "id": "09UL1dCqf2n", "title": "Preference learning for guiding the tree search in continuous POMDPs", "track": "main", "status": "Poster", "tldr": "", "abstract": "A robot operating in a partially observable environment must perform sensing actions to achieve a goal, such as clearing the objects in front of a shelf to better localize a target object at the back, and estimate its shape for grasping. A POMDP is a principled framework for enabling robots to perform such information-gathering actions. 
Unfortunately, while robot manipulation domains involve high-dimensional and continuous observation and action spaces, most POMDP solvers are limited to discrete spaces. Recently, POMCPOW has been proposed for continuous POMDPs, which handles continuity using sampling and progressive widening. However, for robot manipulation problems involving camera observations and multiple objects, POMCPOW is too slow to be practical. We take inspiration from the recent work in learning to guide task and motion planning to propose a framework that learns to guide POMCPOW from past planning experience. Our method uses preference learning that utilizes both success and failure trajectories, where the preference label is given by the results of the tree search. We demonstrate the efficacy of our framework in several continuous partially observable robotics domains, including real-world manipulation, where our framework explicitly reasons about the uncertainty in off-the-shelf segmentation and pose estimation algorithms.", "keywords": "POMDP;Online planning;Guided Search;Preference-based learning", "primary_area": "", "supplementary_material": "/attachment/a79e93094cc92c8c896d1d62467aa0e798aa8012.zip", "author": "Jiyong Ahn;Sanghyeon Son;Dongryung Lee;Jisu Han;Dongwon Son;Beomjoon Kim", "authorids": "~Jiyong_Ahn1;~Sanghyeon_Son1;~Dongryung_Lee1;~Jisu_Han1;~Dongwon_Son1;~Beomjoon_Kim2", "gender": ";;M;;M;M", "homepage": "https://imsquared.github.io/;;https://github.com/dlee960504;;https://dongwon-son.github.io/;https://beomjoonkim.github.io/", "dblp": ";;;;226/6343;88/1505", "google_scholar": ";;;;https://scholar.google.co.kr/citations?user=oaUQsWgAAAAJ;https://scholar.google.ca/citations?user=dw3rEwgAAAAJ", "orcid": ";;;;0000-0003-1446-8125;", "linkedin": ";;;www.linkedin.com/in/jisu-han-;;", "or_profile": "~Jiyong_Ahn1;~Sanghyeon_Son1;~Dongryung_Lee1;~Jisu_Han1;~Dongwon_Son1;~Beomjoon_Kim2", "aff": "Korea Advanced Institute of Science & Technology;;Korea Advanced Institute of Science & Technology;Korea Advanced Institute of Science & Technology;KAIST;Korea Advanced Institute of Science & Technology", "aff_domain": "kaist.edu;;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr;kaist.ac.kr", "position": "MS student;;MS student;MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nahn2023preference,\ntitle={Preference learning for guiding the tree search in continuous {POMDP}s},\nauthor={Jiyong Ahn and Sanghyeon Son and Dongryung Lee and Jisu Han and Dongwon Son and Beomjoon Kim},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=09UL1dCqf2n}\n}", "github": "", "project": "", "reviewers": "dipf;kYWh;8hjC;TVga", "site": "https://openreview.net/forum?id=09UL1dCqf2n", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;3;5", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": 0.8703882797784891, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:ffkiOfOUxQwJ:scholar.google.com/&scioq=Preference+learning+for+guiding+the+tree+search+in+continuous+POMDPs&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Korea Advanced Institute of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kaist.ac.kr", "aff_unique_abbr": "KAIST", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "0I3su3mkuL", "title": 
"Q-Transformer: Scalable Offline Reinforcement Learning via Autoregressive Q-Functions", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this work, we present a scalable reinforcement learning method for training multi-task policies from large offline datasets that can leverage both human demonstrations and autonomously collected data. Our method uses a Transformer to provide a scalable representation for Q-functions trained via offline temporal difference backups. We therefore refer to the method as Q-Transformer. By discretizing each action dimension and representing the Q-value of each action dimension as separate tokens, we can apply effective high-capacity sequence modeling techniques for Q-learning. We present several design decisions that enable good performance with offline RL training, and show that Q-Transformer outperforms prior offline RL algorithms and imitation learning techniques on a large diverse real-world robotic manipulation task suite.", "keywords": "Reinforcement Learning;Offline RL;Transformers;Q-Learning;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/12afd39b441718d351830a962417593467551140.zip", "author": "Yevgen Chebotar;Quan Vuong;Karol Hausman;Fei Xia;Yao Lu;Alex Irpan;Aviral Kumar;Tianhe Yu;Alexander Herzog;Karl Pertsch;Keerthana Gopalakrishnan;Julian Ibarz;Ofir Nachum;Sumedh Anand Sontakke;Grecia Salazar;Huong T Tran;Jodilyn Peralta;Clayton Tan;Deeksha Manjunath;Jaspiar Singh;Brianna Zitkovich;Tomas Jackson;Kanishka Rao;Chelsea Finn;Sergey Levine", "authorids": "~Yevgen_Chebotar1;~Quan_Vuong2;~Karol_Hausman2;~Fei_Xia1;~Yao_Lu13;~Alex_Irpan1;~Aviral_Kumar2;~Tianhe_Yu1;~Alexander_Herzog2;~Karl_Pertsch1;~Keerthana_Gopalakrishnan1;~Julian_Ibarz2;~Ofir_Nachum1;~Sumedh_Anand_Sontakke1;grecias@google.com;huongtt@google.com;jodilyn@google.com;claytontan@google.com;deemd@google.com;jaspiar@google.com;zitkovich@google.com;tomasjackson@google.com;~Kanishka_Rao1;~Chelsea_Finn1;~Sergey_Levine1", "gender": "M;M;;M;;M;M;M;M;;F;M;M;M;;;;;;;;;;F;M", "homepage": ";https://quanvuong.github.io;;;;http://www.alexirpan.com;https://aviralkumar2907.github.io/;https://cs.stanford.edu/~tianheyu/;;https://kpertsch.github.io/;https://keerthanapg.com;https://www.julianibarz.com;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;https://sumedh7.github.io/;;;;;;;;;https://research.google/people/KanishkaRao/;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "01/11424;;;;26/5662-6;202/2063;202/7961;192/1797;;211/7137;;66/10509;;276/0127;;;;;;;;;;131/1783;80/7594", "google_scholar": "ADkiClQAAAAJ;NSWI3OwAAAAJ;;pqP5_PgAAAAJ;OI7zFmwAAAAJ;;;;jrfFYAIAAAAJ;https://scholar.google.com/citations?view_op=list_works;;l-la0GQAAAAJ;C-ZlBWMAAAAJ;https://scholar.google.com/citations?hl=en;;;;;;;;;;vfPE6hgAAAAJ;8R35rCwAAAAJ", "orcid": ";;;0000-0003-4343-1444;;;;;;;;;;;;;;;;;;;;;", "linkedin": ";;;;;;;;alexander-herzog-154030a5/;;;;;sumedh-sontakke-0ab24210a/;;;;;;;;;;;", "or_profile": "~Yevgen_Chebotar1;~Quan_Vuong2;~Karol_Hausman2;~Fei_Xia1;~Yao_Lu13;~Alex_Irpan1;~Aviral_Kumar2;~Tianhe_Yu1;~Alexander_Herzog2;~Karl_Pertsch1;~Keerthana_Gopalakrishnan1;~Julian_Ibarz2;~Ofir_Nachum1;~Sumedh_Anand_Sontakke1;grecias@google.com;huongtt@google.com;jodilyn@google.com;claytontan@google.com;deemd@google.com;jaspiar@google.com;zitkovich@google.com;tomasjackson@google.com;~Kanishka_Rao1;~Chelsea_Finn1;~Sergey_Levine1", "aff": "Google;;;Google;Google;Google DeepMind;University of California, Berkeley;Google 
Brain;Google;University of Southern California;Research, Google;Google;OpenAI;University of Southern California;;;;;;;;;;Google;Google", "aff_domain": "google.com;;;google.com;google.com;google.com;berkeley.edu;google.com;google.com;usc.edu;research.google.com;google.com;openai.com;usc.edu;;;;;;;;;;google.com;google.com", "position": "Research Scientist;;;Researcher;Researcher;Researcher;PhD student;Research Scientist;Researcher;PhD student;Researcher;Senior Staff Software Engineer;Researcher;PhD student;;;;;;;;;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nchebotar2023qtransformer,\ntitle={Q-Transformer: Scalable Offline Reinforcement Learning via Autoregressive Q-Functions},\nauthor={Yevgen Chebotar and Quan Vuong and Karol Hausman and Fei Xia and Yao Lu and Alex Irpan and Aviral Kumar and Tianhe Yu and Alexander Herzog and Karl Pertsch and Keerthana Gopalakrishnan and Julian Ibarz and Ofir Nachum and Sumedh Anand Sontakke and Grecia Salazar and Huong T Tran and Jodilyn Peralta and Clayton Tan and Deeksha Manjunath and Jaspiar Singh and Brianna Zitkovich and Tomas Jackson and Kanishka Rao and Chelsea Finn and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0I3su3mkuL}\n}", "github": "", "project": "", "reviewers": "VHvc;5C59;SvFm;9Xry", "site": "https://openreview.net/forum?id=0I3su3mkuL", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;5;4;4", "rating_avg": 7.0, "confidence_avg": 4.25, "replies_avg": 23, "authors#_avg": 25, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4245565392288907120&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;1;0;0;2;0;0;3;2;0;0", "aff_unique_norm": "Google;University of California, Berkeley;University of Southern California;OpenAI", "aff_unique_dep": "Google;;;", "aff_unique_url": "https://www.google.com;https://www.berkeley.edu;https://www.usc.edu;https://openai.com", "aff_unique_abbr": "Google;UC Berkeley;USC;OpenAI", "aff_campus_unique_index": "0;0;0;2;0;0;3;0;0;3;0;0", "aff_campus_unique": "Mountain View;;Berkeley;Los Angeles", "aff_country_unique_index": "0;0;0;1;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "0bZaUfELuW", "title": "Goal Representations for Instruction Following: A Semi-Supervised Language Interface to Control", "track": "main", "status": "Poster", "tldr": "We train language-conditioned policies in a semi-supervised manner by aligning representations between goal-conditioned and language-conditioned tasks with a contrastive objective.", "abstract": "Our goal is for robots to follow natural language instructions like ``put the towel next to the microwave.'' But getting large amounts of labeled data, i.e. data that contains demonstrations of tasks labeled with the language instruction, is prohibitive. In contrast, obtaining policies that respond to image goals is much easier, because any autonomous trial or demonstration can be labeled in hindsight with its final state as the goal. In this work, we contribute a method that taps into joint image- and goal- conditioned policies with language using only a small amount of language data. Prior work has made progress on this using vision-language models or by jointly training language-goal-conditioned policies, but so far neither method has scaled effectively to real-world robot tasks without significant human annotation. 
Our method achieves robust performance in the real world by learning an embedding from the labeled data that aligns language not to the goal image, but rather to the desired change between the start and goal images that the instruction corresponds to. We then train a policy on this embedding: the policy benefits from all the unlabeled data, but the aligned embedding provides an *interface* for language to steer the policy. We show instruction following across a variety of manipulation tasks in different scenes, with generalization to language instructions outside of the labeled data.", "keywords": "Instruction Following;Representation Learning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/84fa12384daf224123f22470e789d3fc0257820d.zip", "author": "Vivek Myers;Andre Wang He;Kuan Fang;Homer Rich Walke;Philippe Hansen-Estruch;Ching-An Cheng;Mihai Jalobeanu;Andrey Kolobov;Anca Dragan;Sergey Levine", "authorids": "~Vivek_Myers1;~Andre_Wang_He1;~Kuan_Fang3;~Homer_Rich_Walke1;~Philippe_Hansen-Estruch1;~Ching-An_Cheng1;~Mihai_Jalobeanu1;~Andrey_Kolobov1;~Anca_Dragan1;~Sergey_Levine1", "gender": ";M;;M;;M;;M;F;M", "homepage": "https://people.eecs.berkeley.edu/~vmyers/;;;https://homerwalke.com;;http://www.chinganc.com;http://mihaij.com/;https://www.microsoft.com/en-us/research/people/akolobov/;http://www.ancadragan.com/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "270/8694;318/3206.html;;279/6795;;123/6369;;95/3462;;80/7594", "google_scholar": "5NGAbT4AAAAJ;;;ZWH5jCwAAAAJ;;bMZFLZ_V4goC;;xEWgxBsAAAAJ;;8R35rCwAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";andre-he-08778219a/;;;;;;;;", "or_profile": "~Vivek_Myers1;~Andre_Wang_He1;~Kuan_Fang3;~Homer_Rich_Walke1;~Philippe_Hansen-Estruch1;~Ching-An_Cheng1;~Mihai_Jalobeanu1;~Andrey_Kolobov1;~Anca_Dragan1;~Sergey_Levine1", "aff": "University of California, Berkeley;UC Berkeley, University of California, Berkeley;;University of California, Berkeley;;Microsoft Research;Microsoft Research;Microsoft;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;cs.berkeley.edu;;berkeley.edu;;microsoft.com;research.microsoft.com;microsoft.com;berkeley.edu;google.com", "position": "PhD student;Undergrad student;;PhD student;;Principal Researcher;Software Engineer;Researcher;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nmyers2023goal,\ntitle={Goal Representations for Instruction Following: A Semi-Supervised Language Interface to Control},\nauthor={Vivek Myers and Andre Wang He and Kuan Fang and Homer Rich Walke and Philippe Hansen-Estruch and Ching-An Cheng and Mihai Jalobeanu and Andrey Kolobov and Anca Dragan and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0bZaUfELuW}\n}", "github": "https://github.com/rail-berkeley/grif_release", "project": "", "reviewers": "mB1L;HoyX;z7wQ", "site": "https://openreview.net/forum?id=0bZaUfELuW", "pdf_size": 0, "rating": "4;10;10", "confidence": "5;4;4", "rating_avg": 8.0, "confidence_avg": 4.333333333333333, "replies_avg": 16, "authors#_avg": 10, "corr_rating_confidence": -1.0, "gs_citation": 32, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9646982711872255783&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;1;1;1;0;2", "aff_unique_norm": "University of California, Berkeley;Microsoft;Google", "aff_unique_dep": ";Microsoft Research;Google", "aff_unique_url": 
"https://www.berkeley.edu;https://www.microsoft.com/en-us/research;https://www.google.com", "aff_unique_abbr": "UC Berkeley;MSR;Google", "aff_campus_unique_index": "0;0;0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "0hPkttoGAf", "title": "RVT: Robotic View Transformer for 3D Object Manipulation", "track": "main", "status": "Oral", "tldr": "", "abstract": "For 3D object manipulation, methods that build an explicit 3D representation perform better than those relying only on camera images. But using explicit 3D representations like voxels comes at large computing cost, adversely affecting scalability. In this work, we propose RVT, a multi-view transformer for 3D manipulation that is both scalable and accurate. Some key features of RVT are an attention mechanism to aggregate information across views and re-rendering of the camera input from virtual views around the robot workspace. In simulations, we find that a single RVT model works well across 18 RLBench tasks with 249 task variations, achieving $26\\%$ higher relative success than the existing state-of-the-art method (PerAct). It also trains 36X faster than PerAct for achieving the same performance and achieves 2.3X the inference speed of PerAct. Further, RVT can perform a variety of manipulation tasks in the real world with just a few ($\\sim$10) demonstrations per task. Visual results, code, and trained model are provided at: https://robotic-view-transformer.github.io/.", "keywords": "3D Manipulation;Multi-View;Transformer", "primary_area": "", "supplementary_material": "/attachment/f34f8849f11ac801f57fda5e0404cb865c27c754.zip", "author": "Ankit Goyal;Jie Xu;Yijie Guo;Valts Blukis;Yu-Wei Chao;Dieter Fox", "authorids": "~Ankit_Goyal1;~Jie_Xu7;~Yijie_Guo1;~Valts_Blukis1;~Yu-Wei_Chao1;~Dieter_Fox1", "gender": "M;M;F;M;M;M", "homepage": "http://imankgoyal.github.io/;https://people.csail.mit.edu/jiex;;;http://www-personal.umich.edu/~ywchao/;https://homes.cs.washington.edu/~fox/", "dblp": "89/10051-1;37/5126-28;;210/9692;44/10700;f/DieterFox", "google_scholar": "RhN6jKIAAAAJ;3Tj5lWEAAAAJ;ONuIPv0AAAAJ;i9-GzNYAAAAJ;48Y9F-YAAAAJ;DqXsbPAAAAAJ", "orcid": ";;;;;", "linkedin": ";;;valtsblukis/;;", "or_profile": "~Ankit_Goyal1;~Jie_Xu7;~Yijie_Guo1;~Valts_Blukis1;~Yu-Wei_Chao1;~Dieter_Fox1", "aff": "NVIDIA;NVIDIA;University of Michigan;NVIDIA;NVIDIA;Department of Computer Science", "aff_domain": "nvidia.com;nvidia.com;umich.edu;nvidia.com;nvidia.com;cs.washington.edu", "position": "Researcher;Researcher;PhD student;Researcher;Research Scientist;Full Professor", "bibtex": "@inproceedings{\ngoyal2023rvt,\ntitle={{RVT}: Robotic View Transformer for 3D Object Manipulation},\nauthor={Ankit Goyal and Jie Xu and Yijie Guo and Valts Blukis and Yu-Wei Chao and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0hPkttoGAf}\n}", "github": "https://github.com/nvlabs/rvt", "project": "", "reviewers": "CGaw;sWvB;E55B", "site": "https://openreview.net/forum?id=0hPkttoGAf", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 140, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17659152417072357287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;0;2", "aff_unique_norm": "NVIDIA;University of 
Michigan;Unknown Institution", "aff_unique_dep": "NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.umich.edu;", "aff_unique_abbr": "NVIDIA;UM;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "0hQMcWfjG9", "title": "$\\alpha$-MDF: An Attention-based Multimodal Differentiable Filter for Robot State Estimation", "track": "main", "status": "Poster", "tldr": "This paper introduces an attention-based multimodal differentiable filter framework, which learns multimodal latent representations from data and achieves notable advancements in robot state estimation for both rigid body and soft robot dynamics.", "abstract": "Differentiable Filters are recursive Bayesian estimators that derive the state transition and measurement models from data alone. Their data-driven nature eschews the need for explicit analytical models, while remaining algorithmic components of the filtering process intact. As a result, the gain mechanism -- a critical component of the filtering process -- remains non-differentiable and cannot be adjusted to the specific nature of the task or context. In this paper, we propose an attention-based Multimodal Differentiable Filter ($\\alpha$-MDF) which utilizes modern attention mechanisms to learn multimodal latent representations. Unlike previous differentiable filter frameworks, $\\alpha$-MDF substitutes the traditional gain, e.g., the Kalman gain, with a neural attention mechanism. The approach generates specialized, context-dependent gains that can effectively combine multiple input modalities and observed variables. We validate $\\alpha$-MDF on a diverse set of robot state estimation tasks in real world and simulation. Our results show $\\alpha$-MDF achieves significant reductions in state estimation errors, demonstrating nearly 4-fold improvements compared to state-of-the-art sensor fusion strategies for rigid body robots. Additionally, the $\\alpha$-MDF consistently outperforms differentiable filter baselines by up to 45% in soft robotics tasks. 
The project is available at alpha-mdf.github.io and the codebase is at github.com/ir-lab/alpha-MDF", "keywords": "Differentiable Filters;Sensor Fusion;Multimodal Learning", "primary_area": "", "supplementary_material": "/attachment/b55f3f4325eb8e6017efb2e95ccd5d8f6370afe2.zip", "author": "Xiao Liu;Yifan Zhou;Shuhei Ikemoto;Heni Ben Amor", "authorids": "~Xiao_Liu16;~Yifan_Zhou4;~Shuhei_Ikemoto1;~Heni_Ben_Amor4", "gender": "M;M;M;M", "homepage": "https://www.xiao-liu.me/;;;https://cidse.engineering.asu.edu/directory/ben-amor-heni/", "dblp": ";;;18/3990", "google_scholar": "Mu2VjAgAAAAJ;;https://scholar.google.co.jp/citations?user=lVzDBQcAAAAJ;https://scholar.google.com.tw/citations?user=u_7S7VYAAAAJ", "orcid": ";;;", "linkedin": ";yifan-zhou-639974191/;;", "or_profile": "~Xiao_Liu16;~Yifan_Zhou4;~Shuhei_Ikemoto1;~Heni_Amor1", "aff": "Arizona State University;Arizona State University;Kyushu Institute of Technology;Arizona State University", "aff_domain": "asu.edu;asu.edu;kyutech.ac.jp;asu.edu", "position": "PhD student;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2023alphamdf,\ntitle={\\${\\textbackslash}alpha\\$-{MDF}: An Attention-based Multimodal Differentiable Filter for Robot State Estimation},\nauthor={Xiao Liu and Yifan Zhou and Shuhei Ikemoto and Heni Ben Amor},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0hQMcWfjG9}\n}", "github": "https://github.com/ir-lab/alpha-MDF", "project": "", "reviewers": "X85h;zXri;UVgt", "site": "https://openreview.net/forum?id=0hQMcWfjG9", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;3", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13281927621147186529&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Arizona State University;Kyushu Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.asu.edu;https://www.kyutech.ac.jp", "aff_unique_abbr": "ASU;Kyutech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;Japan" }, { "id": "0mRSANSzEK", "title": "Improving Behavioural Cloning with Positive Unlabeled Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning control policies offline from pre-recorded datasets is a promising avenue for solving challenging real-world problems. However, available datasets are typically of mixed quality, with a limited number of the trajectories that we would consider as positive examples; i.e., high-quality demonstrations. Therefore, we propose a novel iterative learning algorithm for identifying expert trajectories in unlabeled mixed-quality robotics datasets given a minimal set of positive examples, surpassing existing algorithms in terms of accuracy. We show that applying behavioral cloning to the resulting filtered dataset outperforms several competitive offline reinforcement learning and imitation learning baselines. We perform experiments on a range of simulated locomotion tasks and on two challenging manipulation tasks on a real robotic system; in these experiments, our method showcases state-of-the-art performance. 
Our website: \\url{https://sites.google.com/view/offline-policy-learning-pubc}.", "keywords": "Offline policy learning;positive unlabeled learning;behavioural cloning", "primary_area": "", "supplementary_material": "/attachment/22285cf537d882f300221e609e6519dc666b812c.zip", "author": "Qiang Wang;Robert McCarthy;David Cordova Bulens;Kevin McGuinness;Noel E. O\u2019Connor;Francisco Roldan Sanchez;Nico G\u00fcrtler;Felix Widmaier;Stephen J. Redmond", "authorids": "~Qiang_Wang17;~Robert_McCarthy2;~David_Cordova_Bulens1;~Kevin_McGuinness1;noel.oconnor@insight-centre.org;~Francisco_Roldan_Sanchez1;~Nico_G\u00fcrtler1;~Felix_Widmaier1;stephen.redmond@ucd.ie", "gender": "M;;M;M;;M;M;;", "homepage": ";;https://www.davidcordovabulens.com/;http://www.eeng.dcu.ie/~mcguinne/;;https://scholar.google.com/citations?user=-Zy5WuAAAAAJ&hl=en&oi=ao;https://www.is.mpg.de/person/nguertler;https://is.tuebingen.mpg.de/person/felixwidmaier;", "dblp": ";;;;;;223/4063;;", "google_scholar": "ZPKVoaYAAAAJ;;;A8LTpP0AAAAJ;;;kuzmML4AAAAJ;;", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Qiang_Wang17;~Robert_McCarthy2;~David_Cordova_Bulens1;~Kevin_McGuinness1;noel.oconnor@insight-centre.org;~Francisco_Roldan_Sanchez1;~Nico_G\u00fcrtler1;~Felix_Widmaier1;stephen.redmond@ucd.ie", "aff": "University College Dublin;;;Dublin City University;;Insight Centre for Data Analytics;Max Planck Institute for Intelligent Systems, Max-Planck Institute;, Max Planck Institute for Intelligent Systems;", "aff_domain": "ucd.ie;;;dcu.ie;;insight-centre.org;tuebingen.mpg.de;is.tuebingen.mpg.de;", "position": "PhD student;;;Assistant Professor;;PhD student;PhD student;Research Engineer;", "bibtex": "@inproceedings{\nwang2023improving,\ntitle={Improving Behavioural Cloning with Positive Unlabeled Learning},\nauthor={Qiang Wang and Robert McCarthy and David Cordova Bulens and Kevin McGuinness and Noel E. O{\\textquoteright}Connor and Francisco Roldan Sanchez and Nico G{\\\"u}rtler and Felix Widmaier and Stephen J. 
Redmond},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0mRSANSzEK}\n}", "github": "", "project": "", "reviewers": "KCwR;MudH;oaHQ;aoto", "site": "https://openreview.net/forum?id=0mRSANSzEK", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;4;3;4", "rating_avg": 8.0, "confidence_avg": 3.75, "replies_avg": 13, "authors#_avg": 9, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2331462888603080127&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;3", "aff_unique_norm": "University College Dublin;Dublin City University;Insight Centre for Data Analytics;Max Planck Institute for Intelligent Systems", "aff_unique_dep": ";;;Intelligent Systems", "aff_unique_url": "https://www.ucd.ie;https://www.dcu.ie;https://insight-centre.org;https://www.mpi-is.mpg.de", "aff_unique_abbr": "UCD;DCU;;MPI-IS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;1", "aff_country_unique": "Ireland;Germany" }, { "id": "0o2JgvlzMUc", "title": "Deception Game: Closing the Safety-Learning Loop in Interactive Robot Autonomy", "track": "main", "status": "Poster", "tldr": "A novel safety analysis framework that closes the loop between the robot's prediction-planning-control pipeline and its runtime learning process.", "abstract": "An outstanding challenge for the widespread deployment of robotic systems like autonomous vehicles is ensuring safe interaction with humans without sacrificing performance. Existing safety methods often neglect the robot\u2019s ability to learn and adapt at runtime, leading to overly conservative behavior. This paper proposes a new closed-loop paradigm for synthesizing safe control policies that explicitly account for the robot\u2019s evolving uncertainty and its ability to quickly respond to future scenarios as they arise, by jointly considering the physical dynamics and the robot\u2019s learning algorithm. 
We leverage adversarial reinforcement learning for tractable safety analysis under high-dimensional learning dynamics and demonstrate our framework\u2019s ability to work with both Bayesian belief propagation and implicit learning through large pre-trained neural trajectory predictors.", "keywords": "Learning-Aware Safety Analysis;Active Information Gathering;Adversarial Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/7df7c0967a1e7de2424e87fbc8b50faec03d558d.zip", "author": "Haimin Hu;Zixu Zhang;Kensuke Nakamura;Andrea Bajcsy;Jaime Fern\u00e1ndez Fisac", "authorids": "~Haimin_Hu1;~Zixu_Zhang1;~Kensuke_Nakamura1;~Andrea_Bajcsy1;~Jaime_Fern\u00e1ndez_Fisac1", "gender": "M;M;;M;F", "homepage": "https://haiminhu.org/;https://zzx9636.github.io/;;https://ee.princeton.edu/people/jaime-fernandez-fisac;https://www.cs.cmu.edu/~abajcsy/", "dblp": "224/8581;https://dblp.org/rec/conf/rss/ZhangF21.html;;156/0109;208/0997", "google_scholar": "https://scholar.google.com/citations?hl=en;J1ub1esAAAAJ;https://scholar.google.ca/citations?hl=en;iAq_9tEAAAAJ;LUe32ToAAAAJ", "orcid": "0000-0002-4217-4776;;;0000-0002-2676-5090;", "linkedin": "haiminhu/;;;jaime-fisac-134341b0/;", "or_profile": "~Haimin_Hu1;~Zixu_Zhang1;~Kensuke_Nakamura1;~Jaime_Fern\u00e1ndez_Fisac1;~Andrea_Victoria_Bajcsy1", "aff": "Toyota Research Institute;Princeton University;Princeton University;Princeton University;University of California, Berkeley", "aff_domain": "tri.global;princeton.edu;princeton.edu;princeton.edu;berkeley.edu", "position": "Intern;PhD student;Undergrad student;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nhu2023deception,\ntitle={Deception Game: Closing the Safety-Learning Loop in Interactive Robot Autonomy},\nauthor={Haimin Hu and Zixu Zhang and Kensuke Nakamura and Andrea Bajcsy and Jaime Fern{\\'a}ndez Fisac},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=0o2JgvlzMUc}\n}", "github": "", "project": "", "reviewers": "BY31;dWyx;6ETr", "site": "https://openreview.net/forum?id=0o2JgvlzMUc", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1513940189984166364&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Toyota Research Institute;Princeton University;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tri.global;https://www.princeton.edu;https://www.berkeley.edu", "aff_unique_abbr": "TRI;Princeton;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "2Qrd-Yw4YmF", "title": "Sequential Dexterity: Chaining Dexterous Policies for Long-Horizon Manipulation", "track": "main", "status": "Poster", "tldr": "We present Sequential Dexterity, a system that learns to chain multiple dexterous manipulation policies for tackling long-horizon manipulation tasks in both simulation and real-world.", "abstract": "Many real-world manipulation tasks consist of a series of subtasks that are significantly different from one another. 
Such long-horizon, complex tasks highlight the potential of dexterous hands, which possess adaptability and versatility, capable of seamlessly transitioning between different modes of functionality without the need for re-grasping or external tools. However, the challenges arise due to the high-dimensional action space of dexterous hand and complex compositional dynamics of the long-horizon tasks. We present Sequential Dexterity, a general system based on reinforcement learning (RL) that chains multiple dexterous policies for achieving long-horizon task goals. The core of the system is a transition feasibility function that progressively finetunes the sub-policies for enhancing chaining success rate, while also enables autonomous policy-switching for recovery from failures and bypassing redundant stages. Despite being trained only in simulation with a few task objects, our system demonstrates generalization capability to novel object shapes and is able to zero-shot transfer to a real-world robot equipped with a dexterous hand. Code and videos are available at https://sequential-dexterity.github.io.", "keywords": "Dexterous Manipulation;Reinforcement Learning;Long-Horizon Manipulation", "primary_area": "", "supplementary_material": "/attachment/3797dad8165bb86e8f70ff0a677479416817c2a3.zip", "author": "Yuanpei Chen;Chen Wang;Li Fei-Fei;Karen Liu", "authorids": "~Yuanpei_Chen2;~Chen_Wang16;~Li_Fei-Fei1;~Karen_Liu1", "gender": "M;M;F;", "homepage": "https://cypypccpy.github.io/;http://www.chenwangjeremy.net/;https://profiles.stanford.edu/fei-fei-li;https://cs.stanford.edu/~karenliu", "dblp": "1234567;;79/2528;", "google_scholar": "https://scholar.google.com/citations?hl=en;lStkAzsAAAAJ;rDfyQnIAAAAJ;i28fU0MAAAAJ", "orcid": "0000-0002-0033-492X;;;0000-0001-5926-0905", "linkedin": ";;fei-fei-li-4541247/;", "or_profile": "~Yuanpei_Chen2;~Chen_Wang16;~Li_Fei-Fei1;~Karen_Liu1", "aff": "South China University of Technology;Computer Science Department, Stanford University;Stanford University;Computer Science Department, Stanford University", "aff_domain": "scut.edu.cn;cs.stanford.edu;stanford.edu;cs.stanford.edu", "position": "Undergrad student;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nchen2023sequential,\ntitle={Sequential Dexterity: Chaining Dexterous Policies for Long-Horizon Manipulation},\nauthor={Yuanpei Chen and Chen Wang and Li Fei-Fei and Karen Liu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=2Qrd-Yw4YmF}\n}", "github": "https://github.com/sequential-dexterity/SeqDex", "project": "", "reviewers": "8hyr;JYyi;7DGx;NBQ2", "site": "https://openreview.net/forum?id=2Qrd-Yw4YmF", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "5;4;5;4", "rating_avg": 7.5, "confidence_avg": 4.5, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": -0.19245008972987526, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10017318188344347779&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "South China University of Technology;Stanford University", "aff_unique_dep": ";Computer Science Department", "aff_unique_url": "https://www.scut.edu.cn;https://www.stanford.edu", "aff_unique_abbr": "SCUT;Stanford", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "China;United States" }, { "id": "2qKBwyLnln", "title": "Policy Stitching: Learning Transferable 
Robot Policies", "track": "main", "status": "Poster", "tldr": "We propose Policy Stitching, a novel framework to facilitate multi-task and multi-robot transfer.", "abstract": "Training robots with reinforcement learning (RL) typically involves heavy interactions with the environment, and the acquired skills are often sensitive to changes in task environments and robot kinematics. Transfer RL aims to leverage previous knowledge to accelerate learning of new tasks or new body configurations. However, existing methods struggle to generalize to novel robot-task combinations and scale to realistic tasks due to complex architecture design or strong regularization that limits the capacity of the learned policy. We propose Policy Stitching, a novel framework that facilitates robot transfer learning for novel combinations of robots and tasks. Our key idea is to apply modular policy design and align the latent representations between the modular interfaces. Our method allows direct stitching of the robot and task modules trained separately to form a new policy for fast adaptation. Our simulated and real-world experiments on various 3D manipulation tasks demonstrate the superior zero-shot and few-shot transfer learning performances of our method.", "keywords": "robot transfer learning;policy stitching", "primary_area": "", "supplementary_material": "/attachment/e957e5ce64858e5b0176a64e64c81e3e5796ab61.zip", "author": "Pingcheng Jian;Easop Lee;Zachary Bell;Michael M. Zavlanos;Boyuan Chen", "authorids": "~Pingcheng_Jian1;~Easop_Lee1;zachary.bell.10@us.af.mil;~Michael_M._Zavlanos2;~Boyuan_Chen1", "gender": "M;;;;Not Specified", "homepage": "https://pingcheng-jian.github.io/;;;;http://boyuanchen.com/", "dblp": "278/2436;;;;193/7174-1", "google_scholar": "2m63kY0AAAAJ;;;;5DBpY6EAAAAJ", "orcid": ";;;;", "linkedin": ";easoplee/;;;boyuan-chen-b30854a0/", "or_profile": "~Pingcheng_Jian1;~Easop_Lee1;zachary.bell.10@us.af.mil;~Michael_M._Zavlanos2;~Boyuan_Chen1", "aff": "Duke University;Duke University;;;Duke University", "aff_domain": "duke.edu;duke.edu;;;duke.edu", "position": "PhD student;PhD student;;;Assistant Professor", "bibtex": "@inproceedings{\njian2023policy,\ntitle={Policy Stitching: Learning Transferable Robot Policies},\nauthor={Pingcheng Jian and Easop Lee and Zachary Bell and Michael M. 
Zavlanos and Boyuan Chen},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=2qKBwyLnln}\n}", "github": "https://github.com/general-robotics-duke/Policy-Stitching", "project": "", "reviewers": "ZQPk;DSoM;BozE;2mFg", "site": "https://openreview.net/forum?id=2qKBwyLnln", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "3;4;4;4", "rating_avg": 6.5, "confidence_avg": 3.75, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.6622661785325219, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17239718148370495152&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "32c8pl84_uD", "title": "Marginalized Importance Sampling for Off-Environment Policy Evaluation", "track": "main", "status": "Poster", "tldr": "We propose a method of validating real-world performance of a robot using just the robot simulator and limited offline data", "abstract": "Reinforcement Learning (RL) methods are typically sample-inefficient, making it challenging to train and deploy RL-policies in real world robots. Even a robust policy trained in simulation requires a real-world deployment to assess their performance. This paper proposes a new approach to evaluate the real-world performance of agent policies prior to deploying them in the real world. Our approach incorporates a simulator along with real-world offline data to evaluate the performance of any policy using the framework of Marginalized Importance Sampling (MIS). Existing MIS methods face two challenges: (1) large density ratios that deviate from a reasonable range and (2) indirect supervision, where the ratio needs to be inferred indirectly, thus exacerbating estimation error. Our approach addresses these challenges by introducing the target policy's occupancy in the simulator as an intermediate variable and learning the density ratio as the product of two terms that can be learned separately. The first term is learned with direct supervision and the second term has a small magnitude, thus making it computationally efficient. We analyze the sample complexity as well as error propagation of our two step-procedure. Furthermore, we empirically evaluate our approach on Sim2Sim environments such as Cartpole, Reacher, and Half-Cheetah. Our results show that our method generalizes well across a variety of Sim2Sim gap, target policies and offline data collection policies. 
We also demonstrate the performance of our algorithm on a Sim2Real task of validating the performance of a 7 DoF robotic arm using offline data along with the Gazebo simulator.", "keywords": "Sim2Real;Policy Evaluation;Robot Validation", "primary_area": "", "supplementary_material": "/attachment/524bf2626a95a3d3bc8d9fd3e1023bc22f6e0ffc.zip", "author": "Pulkit Katdare;Nan Jiang;Katherine Rose Driggs-Campbell", "authorids": "~Pulkit_Katdare1;~Nan_Jiang2;~Katherine_Rose_Driggs-Campbell1", "gender": "M;M;", "homepage": "https://pulkitkatdare.web.illinois.edu/;http://nanjiang.cs.illinois.edu;", "dblp": "198/0632;06/4489-8;", "google_scholar": "yC1tsz8AAAAJ;nUlanA8AAAAJ;", "orcid": ";;", "linkedin": ";nan-jiang-28139937/;", "or_profile": "~Pulkit_Katdare1;~Nan_Jiang2;~Katherine_Rose_Driggs-Campbell1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;", "aff_domain": "illinois.edu;illinois.edu;", "position": "PhD student;Assistant Professor;", "bibtex": "@inproceedings{\nkatdare2023marginalized,\ntitle={Marginalized Importance Sampling for Off-Environment Policy Evaluation},\nauthor={Pulkit Katdare and Nan Jiang and Katherine Rose Driggs-Campbell},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=32c8pl84_uD}\n}", "github": "https://github.com/pulkitkatdare/mis_off_env_eval", "project": "", "reviewers": "iwBL;VF29;1Zrs;dLGh", "site": "https://openreview.net/forum?id=32c8pl84_uD", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;2;4;4", "rating_avg": 5.5, "confidence_avg": 3.25, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.17407765595569782, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11094221165019480695&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "3gh9hf3R6x", "title": "Robot Learning with Sensorimotor Pre-training", "track": "main", "status": "Oral", "tldr": "", "abstract": "We present a self-supervised sensorimotor pre-training approach for robotics. Our model, called RPT, is a Transformer that operates on sequences of sensorimotor tokens. Given a sequence of camera images, proprioceptive robot states, and actions, we encode the sequence into tokens, mask out a subset, and train a model to predict the missing content from the rest. We hypothesize that if a robot can predict the masked-out content it will have acquired a good model of the physical world that can enable it to act. RPT is designed to operate on latent visual representations which makes prediction tractable, enables scaling to larger models, and allows fast inference on a real robot. To evaluate our approach, we collected a dataset of 20,000 real-world trajectories over 9 months using a combination of motion planning and grasping algorithms. 
We find that sensorimotor pre-training consistently outperforms training from scratch, has favorable scaling properties, and enables transfer across different tasks, environments, and robots.", "keywords": "Robot Learning;Self-supervised;Sensorimotor;Pre-training", "primary_area": "", "supplementary_material": "", "author": "Ilija Radosavovic;Baifeng Shi;Letian Fu;Ken Goldberg;Trevor Darrell;Jitendra Malik", "authorids": "~Ilija_Radosavovic1;~Baifeng_Shi1;~Letian_Fu1;~Ken_Goldberg1;~Trevor_Darrell2;~Jitendra_Malik2", "gender": "M;;M;M;M;M", "homepage": "https://people.eecs.berkeley.edu/~ilija;https://bfshi.github.io;https://max-fu.github.io/;http://goldberg.berkeley.edu/;https://people.eecs.berkeley.edu/~malik/;https://people.eecs.berkeley.edu/~trevor/", "dblp": "211/6740;261/9376;;g/KennethYGoldberg;58/2944;d/TrevorDarrell", "google_scholar": "UKpinl8AAAAJ;LBEIm8gAAAAJ;aWot7UgAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ;oY9R5YQAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";;;0000-0001-6747-9499;0000-0003-3695-1580;", "linkedin": ";baifeng-shi-09171b188/;;goldbergken/;;", "or_profile": "~Ilija_Radosavovic1;~Baifeng_Shi1;~Letian_Fu1;~Ken_Goldberg1;~Jitendra_Malik2;~trevor_darrell1", "aff": "University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department", "aff_domain": "berkeley.edu;berkeley.edu;eecs.berkeley.edu;berkeley.edu;berkeley.edu;eecs.berkeley.edu", "position": "PhD student;PhD student;MS student;Full Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nradosavovic2023robot,\ntitle={Robot Learning with Sensorimotor Pre-training},\nauthor={Ilija Radosavovic and Baifeng Shi and Letian Fu and Ken Goldberg and Trevor Darrell and Jitendra Malik},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=3gh9hf3R6x}\n}", "github": "https://github.com/ir413/rpt", "project": "", "reviewers": "8qUw;oCTA;Q5ex;6uLm", "site": "https://openreview.net/forum?id=3gh9hf3R6x", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "5;3;3;4", "rating_avg": 8.0, "confidence_avg": 3.75, "replies_avg": 20, "authors#_avg": 6, "corr_rating_confidence": -0.30151134457776363, "gs_citation": 54, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8207131344459697639&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";Electrical Engineering & Computer Science", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "3uwj8QZROL", "title": "Scaling Up and Distilling Down: Language-Guided Robot Skill Acquisition", "track": "main", "status": "Poster", "tldr": "We present a framework for language-model guided data generation over robot utilities, and multi-task language-conditioned diffusion policy distillation.", "abstract": "We present a framework for robot skill acquisition, which 1) efficiently scale up data generation of language-labelled robot data and 2) effectively distills this data down into a robust multi-task 
language-conditioned visuo-motor policy. For (1), we use a large language model (LLM) to guide high-level planning, and sampling-based robot planners (e.g. motion or grasp samplers) for generating diverse and rich manipulation trajectories. To robustify this data-collection process, the LLM also infers a code-snippet for the success condition of each task, simultaneously enabling the data-collection process to detect failure and retry as well as the automatic labeling of trajectories with success/failure. For (2), we extend the diffusion policy single-task behavior-cloning approach to multi-task settings with language conditioning. Finally, we propose a new multi-task benchmark with 18 tasks across five domains to test long-horizon behavior, common-sense reasoning, tool-use, and intuitive physics. We find that our distilled policy successfully learned the robust retrying behavior in its data collection procedure, while improving absolute success rates by 33.2% on average across five domains. Code, data, and additional qualitative results are available on https://www.cs.columbia.edu/~huy/scalingup/.", "keywords": "skill learning;language;diffusion", "primary_area": "", "supplementary_material": "/attachment/9b4d34cc8fe2a1e676ac8d87266ad6f58800abac.zip", "author": "Huy Ha;Pete Florence;Shuran Song", "authorids": "~Huy_Ha1;~Pete_Florence1;~Shuran_Song3", "gender": "M;;F", "homepage": "https://www.cs.columbia.edu/~huy/;http://www.peteflorence.com/;https://shurans.github.io/", "dblp": "277/9554;;", "google_scholar": "-3-f_8YAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Huy_Ha1;~Pete_Florence1;~Shuran_Song3", "aff": "Columbia University;Google;Columbia University", "aff_domain": "columbia.edu;google.com;cs.columbia.edu", "position": "PhD student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nha2023scaling,\ntitle={Scaling Up and Distilling Down: Language-Guided Robot Skill Acquisition},\nauthor={Huy Ha and Pete Florence and Shuran Song},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=3uwj8QZROL}\n}", "github": "https://github.com/real-stanford/scalingup", "project": "", "reviewers": "kQcE;bJic;t1nk", "site": "https://openreview.net/forum?id=3uwj8QZROL", "pdf_size": 0, "rating": "4;4;10", "confidence": "4;3;5", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844387, "gs_citation": 170, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8641471144667957157&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0", "aff_unique_norm": "Columbia University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.columbia.edu;https://www.google.com", "aff_unique_abbr": "Columbia;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "44FPaVRWkbl", "title": "DORT: Modeling Dynamic Objects in Recurrent for Multi-Camera 3D Object Detection and Tracking", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent multi-camera 3D object detectors usually leverage temporal information to construct multi-view stereo that alleviates the ill-posed depth estimation. However, they typically assume all the objects are static and directly aggregate features across frames. 
This work begins with a theoretical and empirical analysis to reveal that ignoring the motion of moving objects can result in serious localization bias. Therefore, we propose to model Dynamic Objects in RecurrenT (DORT) to tackle this problem. In contrast to previous global BirdEye-View (BEV) methods, DORT extracts object-wise local volumes for motion estimation that also alleviates the heavy computational burden. By iteratively refining the estimated object motion and location, the preceding features can be precisely aggregated to the current frame to mitigate the aforementioned adverse effects. The simple framework has two significant appealing properties. It is flexible and practical that can be plugged into most camera-based 3D object detectors. As there are predictions of object motion in the loop, it can easily track objects across frames according to their nearest center distances. Without bells and whistles, DORT outperforms all the previous methods on the nuScenes detection and tracking benchmarks with 62.8% NDS and 57.6% AMOTA, respectively. The source code will be available at https://github.com/OpenRobotLab/DORT.", "keywords": "Temporal Modeling;3D Object Detection", "primary_area": "", "supplementary_material": "/attachment/5fbd636fb0d4fa7a7f386f5822ecd94022154cf7.zip", "author": "Qing LIAN;Tai Wang;Dahua Lin;Jiangmiao Pang", "authorids": "~Qing_LIAN3;~Tai_Wang2;~Dahua_Lin1;~Jiangmiao_Pang1", "gender": "M;M;M;M", "homepage": "https://www.lianqing11.github.io;http://dahua.site;https://oceanpang.github.io/;https://tai-wang.github.io/", "dblp": "234/4406;53/6088;231/7630;", "google_scholar": ";GMzzRRUAAAAJ;https://scholar.google.com/citations?authuser=0;JmbbZWIAAAAJ", "orcid": ";;0000-0002-6711-9319;", "linkedin": ";;;%E6%B3%B0-%E7%8E%8B-2b2738147/", "or_profile": "~Qing_LIAN3;~Dahua_Lin1;~Jiangmiao_Pang1;~Tai_WANG1", "aff": "Hong Kong University of Science and Technology;The Chinese University of Hong Kong;Shanghai AI Laboratory ;The Chinese University of Hong Kong", "aff_domain": "ust.hk;cuhk.edu.hk;pjlab.org.cn;cuhk.edu.hk", "position": "PhD student;Associate Professor;Research Scientist;PhD student", "bibtex": "@inproceedings{\nlian2023dort,\ntitle={{DORT}: Modeling Dynamic Objects in Recurrent for Multi-Camera 3D Object Detection and Tracking},\nauthor={Qing LIAN and Tai Wang and Dahua Lin and Jiangmiao Pang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=44FPaVRWkbl}\n}", "github": "https://github.com/OpenRobotLab/DORT", "project": "", "reviewers": "wd6g;RSf9;VMsm;fWSU", "site": "https://openreview.net/forum?id=44FPaVRWkbl", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;3;4;3", "rating_avg": 5.5, "confidence_avg": 3.25, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14700063457685893921&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Chinese University of Hong Kong;Shanghai AI Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ust.hk;https://www.cuhk.edu.hk;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "HKUST;CUHK;SAIL", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "48qUHKUEdBf", "title": "STOW: Discrete-Frame Segmentation and Tracking of Unseen Objects for 
Warehouse Picking Robots", "track": "main", "status": "Poster", "tldr": "We formulated the the task of unseen object segmentation and tracking in discrete frame which is essential for warehouse picking robots and proposed a novel method STOW which show significant improvements compared with its counterparts on this task.", "abstract": "Segmentation and tracking of unseen object instances in discrete frames pose a significant challenge in dynamic industrial robotic contexts, such as distribution warehouses. Here, robots must handle object rearrangements, including shifting, removal, and partial occlusion by new items, and track these items after substantial temporal gaps. The task is further complicated when robots encounter objects beyond their training sets, thereby requiring the ability to segment and track previously unseen items. Considering that continuous observation is often inaccessible in such settings, our task involves working with a discrete set of frames separated by indefinite periods, during which substantial changes to the scene may occur. This task also translates to domestic robotic applications, such as table rearrangement. To address these demanding challenges, we introduce new synthetic and real-world datasets that replicate these industrial and household scenarios. Furthermore, we propose a novel paradigm for joint segmentation and tracking in discrete frames, alongside a transformer module that facilitates efficient inter-frame communication. Our approach significantly outperforms recent methods in our experiments. For additional results and videos, please visit \\url{https://sites.google.com/view/stow-corl23}. Code and dataset will be released.", "keywords": "Unseen Object Instance Segmentation;Unsupervised Multi Object Tracking;Zero-shot;Discrete Frames", "primary_area": "", "supplementary_material": "/attachment/dd9bea36a3ab0dcab566382c690349161721fc91.zip", "author": "Yi Li;Muru Zhang;Markus Grotz;Kaichun Mo;Dieter Fox", "authorids": "~Yi_Li9;~Muru_Zhang1;~Markus_Grotz1;~Kaichun_Mo1;~Dieter_Fox1", "gender": "M;M;M;M;M", "homepage": "https://yili.vision;https://nanami18.github.io/;;https://cs.stanford.edu/~kaichun/;https://homes.cs.washington.edu/~fox/", "dblp": "59/871-38;325/4648.html;173/7849;172/1283;f/DieterFox", "google_scholar": "MW36lZUAAAAJ;OJIXk7wAAAAJ;https://scholar.google.de/citations?user=ywTBxOkAAAAJ;pL7JsOsAAAAJ;DqXsbPAAAAAJ", "orcid": ";;0000-0001-7257-5872;;", "linkedin": "yi-li-9846989a/;muruzhang/;markus-grotz-75b55ab4/;;", "or_profile": "~Yi_Li9;~Muru_Zhang1;~Markus_Grotz1;~Kaichun_Mo1;~Dieter_Fox1", "aff": "Department of Computer Science, University of Washington;University of Washington;University of Washington;NVIDIA;Department of Computer Science", "aff_domain": "cs.washington.edu;cs.washington.edu;uw.edu;nvidia.com;cs.washington.edu", "position": "PhD student;MS student;Postdoc;Researcher;Full Professor", "bibtex": "@inproceedings{\nli2023stow,\ntitle={{STOW}: Discrete-Frame Segmentation and Tracking of Unseen Objects for Warehouse Picking Robots},\nauthor={Yi Li and Muru Zhang and Markus Grotz and Kaichun Mo and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=48qUHKUEdBf}\n}", "github": "https://sites.google.com/view/stow-corl23", "project": "", "reviewers": "ySeC;izjm;Bc1q", "site": "https://openreview.net/forum?id=48qUHKUEdBf", "pdf_size": 0, "rating": "4;6;6", "confidence": "2;3;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.0, "replies_avg": 11, 
"authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13247034123312888102&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of Washington;NVIDIA;Unknown Institution", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.washington.edu;https://www.nvidia.com;", "aff_unique_abbr": "UW;NVIDIA;", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "4ZK8ODNyFXx", "title": "Robots That Ask For Help: Uncertainty Alignment for Large Language Model Planners", "track": "main", "status": "Oral", "tldr": "", "abstract": "Large language models (LLMs) exhibit a wide range of promising capabilities --- from step-by-step planning to commonsense reasoning --- that may provide utility for robots, but remain prone to confidently hallucinated predictions. In this work, we present KnowNo, a framework for measuring and aligning the uncertainty of LLM-based planners, such that they know when they don't know, and ask for help when needed. KnowNo builds on the theory of conformal prediction to provide statistical guarantees on task completion while minimizing human help in complex multi-step planning settings. Experiments across a variety of simulated and real robot setups that involve tasks with different modes of ambiguity (for example, from spatial to numeric uncertainties, from human preferences to Winograd schemas) show that KnowNo performs favorably over modern baselines (which may involve ensembles or extensive prompt tuning) in terms of improving efficiency and autonomy, while providing formal assurances. KnowNo can be used with LLMs out-of-the-box without model-finetuning, and suggests a promising lightweight approach to modeling uncertainty that can complement and scale with the growing capabilities of foundation models.", "keywords": "Language-based planning;uncertainty estimation;conformal prediction", "primary_area": "", "supplementary_material": "/attachment/a05fdb3a6cf74a3c0583c570c024bc57fcf88202.zip", "author": "Allen Z. 
Ren;Anushri Dixit;Alexandra Bodrova;Sumeet Singh;Stephen Tu;Noah Brown;Peng Xu;Leila Takayama;Fei Xia;Jake Varley;Zhenjia Xu;Dorsa Sadigh;Andy Zeng;Anirudha Majumdar", "authorids": "~Allen_Z._Ren1;~Anushri_Dixit1;~Alexandra_Bodrova1;~Sumeet_Singh3;~Stephen_Tu1;~Noah_Brown1;~Peng_Xu9;~Leila_Takayama1;~Fei_Xia1;~Jake_Varley1;~Zhenjia_Xu1;~Dorsa_Sadigh1;~Andy_Zeng3;~Anirudha_Majumdar1", "gender": "M;F;F;M;;M;M;F;M;M;M;F;M;M", "homepage": "http://allenzren.github.io/;https://www.anushridixit.com/;https://www.linkedin.com/in/alexandra-bodrova-435593152/;;https://stephentu.github.io/;;;https://www.leilatakayama.org;;http://www.cs.columbia.edu/~jvarley/;https://www.zhenjiaxu.com/;https://dorsa.fyi/;https://irom-lab.princeton.edu/majumdar/;http://andyzeng.github.io/", "dblp": ";;;;09/8165;;;;;;238/0000;117/3174;116/6436;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy", "google_scholar": "mgMzkYMAAAAJ;ADThnCAAAAAJ;;ZGpE5cYAAAAJ;JQcDmB8AAAAJ;wHuVMCkAAAAJ;460NWeQAAAAJ;yFEHsv4AAAAJ;pqP5_PgAAAAJ;UJcm1MoAAAAJ;QE8cLMEAAAAJ;ZaJEZpYAAAAJ;ibu3FwsAAAAJ;q7nFtUcAAAAJ", "orcid": ";;;;;;;;0000-0003-4343-1444;;;;;", "linkedin": "allenzren/;;;;;;;;;;;;;", "or_profile": "~Allen_Z._Ren1;~Anushri_Dixit1;~Alexandra_Bodrova1;~Sumeet_Singh3;~Stephen_Tu1;~Noah_Brown1;~Peng_Xu9;~Leila_Takayama1;~Fei_Xia1;~Jake_Varley1;~Zhenjia_Xu1;~Dorsa_Sadigh1;~Anirudha_Majumdar1;~Andy_Zeng1", "aff": "Google DeepMind;California Institute of Technology;Princeton University;Google Brain Robotics;Google;Research, Google;Google;;Google;Google;Columbia University;Stanford University;Princeton University;Google", "aff_domain": "google.com;caltech.edu;princeton.edu;google.com;google.com;research.google.com;google.com;;google.com;google.com;columbia.edu;stanford.edu;princeton.edu;google.com", "position": "Intern;PhD student;PhD student;Researcher;Researcher;Researcher;Researcher;;Researcher;Engineer;PhD student;Assistant Professor;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nren2023robots,\ntitle={Robots That Ask For Help: Uncertainty Alignment for Large Language Model Planners},\nauthor={Allen Z. 
Ren and Anushri Dixit and Alexandra Bodrova and Sumeet Singh and Stephen Tu and Noah Brown and Peng Xu and Leila Takayama and Fei Xia and Jake Varley and Zhenjia Xu and Dorsa Sadigh and Andy Zeng and Anirudha Majumdar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=4ZK8ODNyFXx}\n}", "github": "https://github.com/google-research/google-research/tree/master/language_model_uncertainty", "project": "", "reviewers": "4oDG;fuC2;mihK;aKea", "site": "https://openreview.net/forum?id=4ZK8ODNyFXx", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;5;5;4", "rating_avg": 8.0, "confidence_avg": 4.5, "replies_avg": 20, "authors#_avg": 14, "corr_rating_confidence": 0.0, "gs_citation": 248, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=751643712331399494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;2;0;0;0;0;0;0;3;4;2;0", "aff_unique_norm": "Google;California Institute of Technology;Princeton University;Columbia University;Stanford University", "aff_unique_dep": "Google DeepMind;;;;", "aff_unique_url": "https://deepmind.com;https://www.caltech.edu;https://www.princeton.edu;https://www.columbia.edu;https://www.stanford.edu", "aff_unique_abbr": "DeepMind;Caltech;Princeton;Columbia;Stanford", "aff_campus_unique_index": "1;2;2;2;2;2;2;3;2", "aff_campus_unique": ";Pasadena;Mountain View;Stanford", "aff_country_unique_index": "0;1;1;1;1;1;1;1;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "4uFVn6WHyzo", "title": "Generating Transferable Adversarial Simulation Scenarios for Self-Driving via Neural Rendering", "track": "main", "status": "Poster", "tldr": "Adversarial attacks for autonomous driving using differentiable surrogate simulators", "abstract": "Self-driving software pipelines include components that are learned from a significant number of training examples, yet it remains challenging to evaluate the overall system's safety and generalization performance. Together with scaling up the real-world deployment of autonomous vehicles, it is of critical importance to automatically find simulation scenarios where the driving policies will fail. We propose a method that efficiently generates adversarial simulation scenarios for autonomous driving by solving an optimal control problem that aims to maximally perturb the policy from its nominal trajectory. Given an image-based driving policy, we show that we can inject new objects in a neural rendering representation of the deployment scene, and optimize their texture in order to generate adversarial sensor inputs to the policy. We demonstrate that adversarial scenarios discovered purely in the neural renderer (surrogate scene) can often be successfully transferred to the deployment scene, without further optimization. 
We demonstrate this transfer occurs both in simulated and real environments, provided the learned surrogate scene is sufficiently close to the deployment scene.", "keywords": "robotics;adversarial attacks;simulation", "primary_area": "", "supplementary_material": "/attachment/3a65261ca91c063bd0ecb32ef8829ec8220f1122.zip", "author": "Yasasa Abeysirigoonawardena;Kevin Xie;Chuhan Chen;Salar Hosseini Khorasgani;Ruiting Chen;Ruiqi Wang;Florian Shkurti", "authorids": "~Yasasa_Abeysirigoonawardena1;~Kevin_Xie1;~Chuhan_Chen1;~Salar_Hosseini_Khorasgani1;ruiting.chen@mail.utoronto.ca;~Ruiqi_Wang5;~Florian_Shkurti1", "gender": "M;M;F;M;;;M", "homepage": "https://yasasa.me;https://kevincxie.github.io;https://sally-chen.github.io/;;;;http://www.cs.toronto.edu/~florian/", "dblp": ";162/1953;345/7921;323/4107;;;21/10333", "google_scholar": ";04dL0akAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.ca/citations?user=8OT5mY0AAAAJ;;;https://scholar.google.ca/citations?hl=en", "orcid": ";;;;;;", "linkedin": "yasasa-abeysirigoonawardena-819229198/;;chuhan-chen/;salar-hosseini/;;ruiqi-wang-3b970b150;", "or_profile": "~Yasasa_Abeysirigoonawardena1;~Kevin_Xie1;~Chuhan_Chen1;~Salar_Hosseini_Khorasgani1;ruiting.chen@mail.utoronto.ca;~Ruiqi_Wang5;~Florian_Shkurti1", "aff": "Department of Computer Science;Department of Computer Science, University of Toronto;Flawless AI Inc.;Toronto University;;Stanford University;University of Toronto", "aff_domain": "cs.toronto.edu;cs.toronto.edu;flawlessai.com;utoronto.ca;;stanford.edu;cs.toronto.edu", "position": "MS student;PhD student;Intern;MS student;;MS student;Assistant Professor", "bibtex": "@inproceedings{\nabeysirigoonawardena2023generating,\ntitle={Generating Transferable Adversarial Simulation Scenarios for Self-Driving via Neural Rendering},\nauthor={Yasasa Abeysirigoonawardena and Kevin Xie and Chuhan Chen and Salar Hosseini Khorasgani and Ruiting Chen and Ruiqi Wang and Florian Shkurti},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=4uFVn6WHyzo}\n}", "github": "", "project": "", "reviewers": "Hzbf;qikM;i9eu;8f1g", "site": "https://openreview.net/forum?id=4uFVn6WHyzo", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;4;4", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2260346118393523921&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;1;3;1", "aff_unique_norm": "Unknown Institution;University of Toronto;Flawless AI Inc.;Stanford University", "aff_unique_dep": "Department of Computer Science;Department of Computer Science;;", "aff_unique_url": ";https://www.utoronto.ca;;https://www.stanford.edu", "aff_unique_abbr": ";U of T;;Stanford", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Toronto;Stanford", "aff_country_unique_index": "1;2;1;2;1", "aff_country_unique": ";Canada;United States" }, { "id": "4x2RUQ99sGz", "title": "Online Model Adaptation with Feedforward Compensation", "track": "main", "status": "Poster", "tldr": "This work proposes an online adaptation method with feedforward compensation.", "abstract": "To cope with distribution shifts or non-stationarity in system dynamics, online adaptation algorithms have been introduced to update offline-learned prediction models in real-time. 
Existing online adaptation methods focus on optimizing the prediction model by utilizing feedback from the latest prediction error. Unfortunately, this feedback-based approach is susceptible to forgetting past information. This work proposes an online adaptation method with feedforward compensation, which uses critical data samples from a memory buffer, instead of the latest samples, to optimize the prediction model. We prove that the proposed approach achieves a smaller error bound compared to previously utilized methods in slow time-varying systems. We conducted experiments on several prediction tasks, which clearly illustrate the superiority of the proposed feedforward adaptation method. Furthermore, our feedforward adaptation technique is capable of estimating an uncertainty bound for predictions.", "keywords": "Online Adaptation;Optimization;Behavior prediction", "primary_area": "", "supplementary_material": "/attachment/cf251bf77f504f09fb1cc9baf6fa589739e44487.zip", "author": "ABULIKEMU ABUDUWEILI;Changliu Liu", "authorids": "~ABULIKEMU_ABUDUWEILI1;~Changliu_Liu1", "gender": "M;F", "homepage": "https://walleclipse.github.io/;http://www.cs.cmu.edu/~cliu6/index.html", "dblp": "245/8652;166/3563", "google_scholar": "6Oro5g8AAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~ABULIKEMU_ABUDUWEILI1;~Changliu_Liu1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nabuduweili2023online,\ntitle={Online Model Adaptation with Feedforward Compensation},\nauthor={ABULIKEMU ABUDUWEILI and Changliu Liu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=4x2RUQ99sGz}\n}", "github": "https://github.com/intelligent-control-lab/Feedforward_Adaptation", "project": "", "reviewers": "yQR8;yPZ4;GpzZ;sMeX", "site": "https://openreview.net/forum?id=4x2RUQ99sGz", "pdf_size": 0, "rating": "4;4;4;10", "confidence": "4;3;3;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1768032192631146689&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "5JMGq83yf1N", "title": "Ready, Set, Plan! Planning to Goal Sets Using Generalized Bayesian Inference", "track": "main", "status": "Poster", "tldr": "We propose a goal set planner which plans to uncertain goal regions using demonstrations of valid goals.", "abstract": "Many robotic tasks can have multiple and diverse solutions and, as such, are naturally expressed as goal sets. Examples include navigating to a room, finding a feasible placement location for an object, or opening a drawer enough to reach inside. Using a goal set as a planning objective requires that a model for the objective be explicitly given by the user. However, some goals are intractable to model, leading to uncertainty over the goal (e.g. stable grasping of an object). In this work, we propose a technique for planning directly to a set of sampled goal configurations. 
We formulate a planning as inference problem with a novel goal likelihood evaluated against the goal samples. To handle the intractable goal likelihood, we employ Generalized Bayesian Inference to approximate the trajectory distribution. The result is a fully differentiable cost which generalizes across a diverse range of goal set objectives for which samples can be obtained. We show that by considering all goal samples throughout the planning process, our method reliably finds plans on manipulation and navigation problems where heuristic approaches fail.", "keywords": "Planning as inference;Variational inference;Nonparametric learning", "primary_area": "", "supplementary_material": "/attachment/a9ef969e5bc68c0ecebd18544d355266054a0cea.zip", "author": "Jana Pavlasek;Stanley Robert Lewis;Balakumar Sundaralingam;Fabio Ramos;Tucker Hermans", "authorids": "~Jana_Pavlasek1;~Stanley_Robert_Lewis1;~Balakumar_Sundaralingam1;~Fabio_Ramos1;~Tucker_Hermans2", "gender": "F;;M;M;M", "homepage": "http://janapavlasek.com;;https://balakumar-s.github.io/;https://fabioramos.github.io/;https://robot-learning.cs.utah.edu", "dblp": ";149/8180.html;;22/2488;https://dblp.uni-trier.de/pid/67/4241", "google_scholar": "https://scholar.google.ca/citations?user=yJS-u7IAAAAJ;D8nfug0AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com.au/citations?user=T_mJiHoAAAAJ;G5_VFfkAAAAJ", "orcid": "0000-0001-6332-2646;;;;0000-0003-2496-2768", "linkedin": ";stanley-lewis-79a89183/;;fabio-ramos-3256b421/;", "or_profile": "~Jana_Pavlasek1;~Stanley_Robert_Lewis1;~Balakumar_Sundaralingam1;~Fabio_Ramos1;~Tucker_Hermans2", "aff": "University of Michigan;University of Michigan;NVIDIA;NVIDIA;University of Utah", "aff_domain": "umich.edu;umich.edu;nvidia.com;nvidia.com;utah.edu", "position": "PhD student;PhD student;Research Scientist;Principal Research Scientist;Associate Professor", "bibtex": "@inproceedings{\npavlasek2023ready,\ntitle={Ready, Set, Plan! 
Planning to Goal Sets Using Generalized Bayesian Inference},\nauthor={Jana Pavlasek and Stanley Robert Lewis and Balakumar Sundaralingam and Fabio Ramos and Tucker Hermans},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=5JMGq83yf1N}\n}", "github": "", "project": "", "reviewers": "aNNG;CHoH;bCZS;Jat5", "site": "https://openreview.net/forum?id=5JMGq83yf1N", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;3;4", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9166872698524570276&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;1;2", "aff_unique_norm": "University of Michigan;NVIDIA;University of Utah", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.umich.edu;https://www.nvidia.com;https://www.utah.edu", "aff_unique_abbr": "UM;NVIDIA;Utah", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "69y5fzvaAT", "title": "RoboCook: Long-Horizon Elasto-Plastic Object Manipulation with Diverse Tools", "track": "main", "status": "Oral", "tldr": "RoboCook perceives, models, and manipulates elasto-plastic objects for long-horizon tasks like making dumplings and alphabet letter cookies.", "abstract": "Humans excel in complex long-horizon soft body manipulation tasks via flexible tool use: bread baking requires a knife to slice the dough and a rolling pin to flatten it. Often regarded as a hallmark of human cognition, tool use in autonomous robots remains limited due to challenges in understanding tool-object interactions. Here we develop an intelligent robotic system, RoboCook, which perceives, models, and manipulates elasto-plastic objects with various tools. RoboCook uses point cloud scene representations, models tool-object interactions with Graph Neural Networks (GNNs), and combines tool classification with self-supervised policy learning to devise manipulation plans. We demonstrate that from just 20 minutes of real-world interaction data per tool, a general-purpose robot arm can learn complex long-horizon soft object manipulation tasks, such as making dumplings and alphabet letter cookies. 
Extensive evaluations show that RoboCook substantially outperforms state-of-the-art approaches, exhibits robustness against severe external disturbances, and demonstrates adaptability to different materials.", "keywords": "Deformable Object Manipulation;Long-horizon Planning;Model Learning;Tool Usage", "primary_area": "", "supplementary_material": "/attachment/c5bf181c52830681f20be08059c798be5e686896.zip", "author": "Haochen Shi;Huazhe Xu;Samuel Clarke;Yunzhu Li;Jiajun Wu", "authorids": "~Haochen_Shi2;~Huazhe_Xu1;~Samuel_Clarke1;~Yunzhu_Li1;~Jiajun_Wu1", "gender": "M;M;;M;M", "homepage": "https://hshi74.github.io/;http://hxu.rocks;;https://yunzhuli.github.io/;https://jiajunwu.com", "dblp": ";164/9006;;182/1831;117/4768", "google_scholar": "https://scholar.google.com/citations?hl=en;t9HPFawAAAAJ;;WlA92lcAAAAJ;2efgcS0AAAAJ", "orcid": "0000-0002-3604-465X;;;;0000-0002-4176-343X", "linkedin": ";;;;jiajunwu/", "or_profile": "~Haochen_Shi2;~Huazhe_Xu1;~Samuel_Clarke1;~Yunzhu_Li1;~Jiajun_Wu1", "aff": "Stanford University;Tsinghua University;;Stanford University;Stanford University", "aff_domain": "stanford.edu;tsinghua.edu.cn;;stanford.edu;stanford.edu", "position": "MS student;Assistant Professor;;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nshi2023robocook,\ntitle={RoboCook: Long-Horizon Elasto-Plastic Object Manipulation with Diverse Tools},\nauthor={Haochen Shi and Huazhe Xu and Samuel Clarke and Yunzhu Li and Jiajun Wu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=69y5fzvaAT}\n}", "github": "https://github.com/hshi74/robocook", "project": "", "reviewers": "UkA9;VnEP;bNTe;WdqP", "site": "https://openreview.net/forum?id=69y5fzvaAT", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "5;4;4;3", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": -0.7071067811865475, "gs_citation": 62, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7104147273386997637&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Stanford University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "Stanford;THU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "6Um8P8Fvyhl", "title": "MOTO: Offline Pre-training to Online Fine-tuning for Model-based Robot Learning", "track": "main", "status": "Poster", "tldr": "We develop a model-based RL method specifically designed for online fine-tuning of robot tasks. MOTO is the first method to solve the Franka Kitchen environment from images.", "abstract": "We study the problem of offline pre-training and online fine-tuning for reinforcement learning from high-dimensional observations in the context of realistic robot tasks. Recent offline model-free approaches successfully use online fine-tuning to either improve the performance of the agent over the data collection policy or adapt to novel tasks. At the same time, model-based RL algorithms have achieved significant progress in sample efficiency and the complexity of the tasks they can solve, yet remain under-utilized in the fine-tuning setting. 
In this work, we argue that existing methods for high-dimensional model-based offline RL are not suitable for offline-to-online fine-tuning due to issues with distribution shifts, off-dynamics data, and non-stationary rewards. We propose an on-policy model-based method that can efficiently reuse prior data through model-based value expansion and policy regularization, while preventing model exploitation by controlling epistemic uncertainty. We find that our approach successfully solves tasks from the MetaWorld benchmark, as well as the Franka Kitchen robot manipulation environment completely from images. To our knowledge, MOTO is the first and only method to solve this environment from pixels.", "keywords": "offline RL;online fine-tuning;model-learning;robot learning", "primary_area": "", "supplementary_material": "/attachment/f2c628412cef4ef2fd4afee68ea57cf0ce7448ea.zip", "author": "Rafael Rafailov;Kyle Beltran Hatch;Victor Kolev;John D Martin;Mariano Phielipp;Chelsea Finn", "authorids": "~Rafael_Rafailov1;~Kyle_Beltran_Hatch1;~Victor_Kolev1;~John_D_Martin1;~Mariano_Phielipp2;~Chelsea_Finn1", "gender": "M;M;M;M;M;F", "homepage": "https://rmrafailov.github.io/;https://khatch31.github.io/;https://victorkolev.github.io;https://jdmartin86.github.io;https://www.intel.com/content/www/us/en/research/researchers/mariano-phielipp.html;https://ai.stanford.edu/~cbfinn/", "dblp": "272/5358;;;241/6120.html;23/4518;131/1783", "google_scholar": "TwABcRgAAAAJ;;;Jun8c34AAAAJ;YArRsvEAAAAJ;vfPE6hgAAAAJ", "orcid": ";;;;;", "linkedin": ";kyle-h-3402a792/;;;mariano-phielipp-941624;", "or_profile": "~Rafael_Rafailov1;~Kyle_Beltran_Hatch1;~Victor_Kolev1;~John_D_Martin1;~Mariano_Phielipp2;~Chelsea_Finn1", "aff": "Stanford University;Toyota Research Institute;Stanford University;Intel;Intel Labs;Google", "aff_domain": "stanford.edu;tri.global;stanford.edu;intel.com;intel.com;google.com", "position": "PhD student;Researcher;Undergrad student;Researcher;Principal Researcher;Research Scientist", "bibtex": "@inproceedings{\nrafailov2023moto,\ntitle={{MOTO}: Offline Pre-training to Online Fine-tuning for Model-based Robot Learning},\nauthor={Rafael Rafailov and Kyle Beltran Hatch and Victor Kolev and John D Martin and Mariano Phielipp and Chelsea Finn},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=6Um8P8Fvyhl}\n}", "github": "", "project": "", "reviewers": "KHbi;ZAHx;Ar1n;kYGq", "site": "https://openreview.net/forum?id=6Um8P8Fvyhl", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;3;3", "rating_avg": 5.5, "confidence_avg": 3.25, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3361806679040928692&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;2;2;3", "aff_unique_norm": "Stanford University;Toyota Research Institute;Intel;Google", "aff_unique_dep": ";;Intel Corporation;Google", "aff_unique_url": "https://www.stanford.edu;https://www.tri.global;https://www.intel.com;https://www.google.com", "aff_unique_abbr": "Stanford;TRI;Intel;Google", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "6a4sECAMCA", "title": "Learning to Drive Anywhere", "track": "main", "status": "Poster", "tldr": "", "abstract": "Human drivers can seamlessly adapt their driving decisions across geographical 
locations with diverse conditions and rules of the road, e.g., left vs. right-hand traffic. In contrast, existing models for autonomous driving have been thus far only deployed within restricted operational domains, \\ie, without accounting for varying driving behaviors across locations or model scalability. In this work, we propose GeCo, a single geographically-aware conditional imitation learning (CIL) model that can efficiently learn from heterogeneous and globally distributed data with dynamic environmental, traffic, and social characteristics. Our key insight is to introduce a high-capacity, geo-location-based channel attention mechanism that effectively adapts to local nuances while also flexibly modeling similarities among regions in a data-driven manner. By optimizing a contrastive imitation objective, our proposed approach can efficiently scale across the inherently imbalanced data distributions and location-dependent events. We demonstrate the benefits of our GeCo agent across multiple datasets, cities, and scalable deployment paradigms, \\ie, centralized, semi-supervised, and distributed agent training. Specifically, GeCo outperforms CIL baselines by over 14% in open-loop evaluation and 30% in closed-loop testing on CARLA.", "keywords": "Global-scale Autonomous Driving;Imitation Learning;Transformer", "primary_area": "", "supplementary_material": "/attachment/62e5c92ebd35dd658fccd1b085b4b1afcaf97b5d.zip", "author": "Ruizhao Zhu;Peng Huang;Eshed Ohn-Bar;Venkatesh Saligrama", "authorids": "~Ruizhao_Zhu1;~Peng_Huang8;~Eshed_Ohn-Bar4;~Venkatesh_Saligrama1", "gender": "M;;Not Specified;", "homepage": "https://ruizhaoz.github.io;;https://eshed1.github.io/;https://venkatesh-saligrama.github.io/", "dblp": "285/9764;;121/0305;67/4721", "google_scholar": "otVAkGkAAAAJ;g06wPJwAAAAJ;p9zVBV4AAAAJ;S4z3uzMAAAAJ", "orcid": ";;;0000-0002-0675-2268", "linkedin": ";;;venkatesh-saligrama-91175a16/", "or_profile": "~Ruizhao_Zhu1;~Peng_Huang8;~Eshed_Ohn-Bar4;~Venkatesh_Saligrama1", "aff": "Boston University;Boston University;Boston University;Boston University", "aff_domain": "bu.edu;bu.edu;bu.edu;bu.edu", "position": "PhD student;MS student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nzhu2023learning,\ntitle={Learning to Drive Anywhere},\nauthor={Ruizhao Zhu and Peng Huang and Eshed Ohn-Bar and Venkatesh Saligrama},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=6a4sECAMCA}\n}", "github": "", "project": "", "reviewers": "bBuC;tmmM;pg55;etZf;zjyo", "site": "https://openreview.net/forum?id=6a4sECAMCA", "pdf_size": 0, "rating": "4;6;6;6;6", "confidence": "4;4;4;5;3", "rating_avg": 5.6, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2893940812275100171&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Boston University", "aff_unique_dep": "", "aff_unique_url": "https://www.bu.edu", "aff_unique_abbr": "BU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "6kSohKYYTn0", "title": "Measuring Interpretability of Neural Policies of Robots with Disentangled Representation", "track": "main", "status": "Oral", "tldr": "Investigate interpretability of compact neural policies with disentanglement and decision tree", "abstract": "The advancement of robots, 
particularly those functioning in complex human-centric environments, relies on control solutions that are driven by machine learning. Understanding how learning-based controllers make decisions is crucial since robots are mostly safety-critical systems. This urges a formal and quantitative understanding of the explanatory factors in the interpretability of robot learning. In this paper, we aim to study interpretability of compact neural policies through the lens of disentangled representation. We leverage decision trees to obtain factors of variation [1] for disentanglement in robot learning; these encapsulate skills, behaviors, or strategies toward solving tasks. To assess how well networks uncover the underlying task dynamics, we introduce interpretability metrics that measure disentanglement of learned neural dynamics from a concentration of decisions, mutual information and modularity perspective. We showcase the effectiveness of the connection between interpretability and disentanglement consistently across extensive experimental analysis.", "keywords": "Interpretability;Disentangled Representation;Neural Policy", "primary_area": "", "supplementary_material": "/attachment/79a758507f8f6871cf2345eec8ce5fc6d3015331.zip", "author": "Tsun-Hsuan Wang;Wei Xiao;Tim Seyde;Ramin Hasani;Daniela Rus", "authorids": "~Tsun-Hsuan_Wang2;~Wei_Xiao2;~Tim_Seyde1;~Ramin_Hasani1;~Daniela_Rus1", "gender": "M;M;;F;M", "homepage": "https://zswang666.github.io/;;;https://www.csail.mit.edu/person/daniela-rus;http://www.raminhasani.com", "dblp": "217/1809.html;20/4794-3;226/6408;r/DanielaRus;190/3168", "google_scholar": "xE3WSuYAAAAJ;BxdZJNQAAAAJ;FJ7ILzkAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.at/citations?user=YarJF3QAAAAJ", "orcid": ";;;;0000-0002-9889-5222", "linkedin": ";;;;raminhasani/", "or_profile": "~Tsun-Hsuan_Wang2;~Wei_Xiao2;~Tim_Seyde1;~Daniela_Rus1;~Ramin_M._Hasani1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;Student;Full Professor;Researcher", "bibtex": "@inproceedings{\nwang2023measuring,\ntitle={Measuring Interpretability of Neural Policies of Robots with Disentangled Representation},\nauthor={Tsun-Hsuan Wang and Wei Xiao and Tim Seyde and Ramin Hasani and Daniela Rus},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=6kSohKYYTn0}\n}", "github": "https://github.com/zswang666/interpret-by-disentangle", "project": "", "reviewers": "tXoG;75ky;BCVj;Z6qk", "site": "https://openreview.net/forum?id=6kSohKYYTn0", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "2;3;4;2", "rating_avg": 7.0, "confidence_avg": 2.75, "replies_avg": 23, "authors#_avg": 5, "corr_rating_confidence": -0.5222329678670935, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13470278250999869618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "6zGpfOBImD", "title": "M2T2: Multi-Task Masked Transformer for Object-centric Pick and Place", 
"track": "main", "status": "Poster", "tldr": "M2T2 (Multi-Task Masked Transformer) is a unified network architecture for predicting different types of action primitives.", "abstract": "With the advent of large language models and large-scale robotic datasets, there has been tremendous progress in high-level decision-making for object manipulation. These generic models are able to interpret complex tasks using language commands, but they often have difficulties generalizing to out-of-distribution objects due to the inability of low-level action primitives. In contrast, existing task-specific models excel in low-level manipulation of unknown objects, but only work for a single type of action. To bridge this gap, we present M2T2, a single model that supplies different types of low-level actions that work robustly on arbitrary objects in cluttered scenes. M2T2 is a transformer model which reasons about contact points and predicts valid gripper poses for different action modes given a raw point cloud of the scene. Trained on a large-scale synthetic dataset with 128K scenes, M2T2 achieves zero-shot sim2real transfer on the real robot, outperforming the baseline system with state-of-the-art task-specific models by about 19% in overall performance and 37.5% in challenging scenes were the object needs to be re-oriented for collision-free placement. M2T2 also achieves state-of-the-art results on a subset of language conditioned tasks in RLBench. Videos of robot experiments on unseen objects in both real world and simulation are available at m2-t2.github.io.", "keywords": "Object manipulation;Multi-task learning;Pick and place", "primary_area": "", "supplementary_material": "/attachment/f1f07ec0d74ccea3750feca4b4194ce580417c42.zip", "author": "Wentao Yuan;Adithyavairavan Murali;Arsalan Mousavian;Dieter Fox", "authorids": "~Wentao_Yuan1;~Adithyavairavan_Murali2;~Arsalan_Mousavian1;~Dieter_Fox1", "gender": "M;M;M;M", "homepage": "https://wentaoyuan.github.io;http://adithyamurali.com;https://cs.gmu.edu/~amousavi/;https://homes.cs.washington.edu/~fox/", "dblp": "225/4795.html;;164/8572;f/DieterFox", "google_scholar": "PZZZG6YAAAAJ;Tjj8TZAAAAAJ;fcA9m88AAAAJ;DqXsbPAAAAAJ", "orcid": "0000-0002-3836-8877;;;", "linkedin": ";adithyamurali;;", "or_profile": "~Wentao_Yuan1;~Adithyavairavan_Murali2;~Arsalan_Mousavian1;~Dieter_Fox1", "aff": "University of Washington, Seattle;;NVIDIA;Department of Computer Science", "aff_domain": "uw.edu;;nvidia.com;cs.washington.edu", "position": "PhD student;;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nyuan2023mt,\ntitle={M2T2: Multi-Task Masked Transformer for Object-centric Pick and Place},\nauthor={Wentao Yuan and Adithyavairavan Murali and Arsalan Mousavian and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=6zGpfOBImD}\n}", "github": "https://m2-t2.github.io", "project": "", "reviewers": "bMyz;whqu;dMwG;Bsjg", "site": "https://openreview.net/forum?id=6zGpfOBImD", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;4;3", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=736426184769083738&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Washington;NVIDIA;Unknown Institution", "aff_unique_dep": ";NVIDIA Corporation;Department of Computer Science", 
"aff_unique_url": "https://www.washington.edu;https://www.nvidia.com;", "aff_unique_abbr": "UW;NVIDIA;", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "id": "770xKAHeFS", "title": "How to Learn and Generalize From Three Minutes of Data: Physics-Constrained and Uncertainty-Aware Neural Stochastic Differential Equations", "track": "main", "status": "Oral", "tldr": "We use neural SDEs to learn uncertainty-aware, data-driven models that leverage a priori physics knowledge.", "abstract": "We present a framework and algorithms to learn controlled dynamics models using neural stochastic differential equations (SDEs)---SDEs whose drift and diffusion terms are both parametrized by neural networks. We construct the drift term to leverage a priori physics knowledge as inductive bias, and we design the diffusion term to represent a distance-aware estimate of the uncertainty in the learned model's predictions---it matches the system's underlying stochasticity when evaluated on states near those from the training dataset, and it predicts highly stochastic dynamics when evaluated on states beyond the training regime. The proposed neural SDEs can be evaluated quickly enough for use in model predictive control algorithms, or they can be used as simulators for model-based reinforcement learning. Furthermore, they make accurate predictions over long time horizons, even when trained on small datasets that cover limited regions of the state space. We demonstrate these capabilities through experiments on simulated robotic systems, as well as by using them to model and control a hexacopter's flight dynamics: A neural SDE trained using only three minutes of manually collected flight data results in a model-based control policy that accurately tracks aggressive trajectories that push the hexacopter's velocity and Euler angles to nearly double the maximum values observed in the training dataset.", "keywords": "Neural SDE;Physics-Informed Learning;Data-Driven Modeling;Dynamical Systems;Control;Model-Based Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/acbe87830fcb07b8ec802a0078757b67243a3cfd.zip", "author": "Franck Djeumou;Cyrus Neary;ufuk topcu", "authorids": "~Franck_Djeumou1;~Cyrus_Neary1;~ufuk_topcu1", "gender": "M;Unspecified;M", "homepage": "https://www.cyrusneary.com/;https://autonomy.oden.utexas.edu/;", "dblp": "269/9716.html;12/6659.html;", "google_scholar": "z4JrPP0AAAAJ;jeNGFfQAAAAJ;5YBGZWcAAAAJ", "orcid": "0000-0002-5293-5663;0000-0003-0819-9985;", "linkedin": ";;franck-djeumou-626613202", "or_profile": "~Cyrus_Neary1;~ufuk_topcu1;~Franck_B_Djeumou1", "aff": "The University of Texas at Austin;University of Texas, Austin;University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\ndjeumou2023how,\ntitle={How to Learn and Generalize From Three Minutes of Data: Physics-Constrained and Uncertainty-Aware Neural Stochastic Differential Equations},\nauthor={Franck Djeumou and Cyrus Neary and ufuk topcu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=770xKAHeFS}\n}", "github": "https://github.com/wuwushrek/sde4mbrl", "project": "", "reviewers": "BxRN;5AP6;S722;DHcC", "site": "https://openreview.net/forum?id=770xKAHeFS", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;3;5", "rating_avg": 7.0, "confidence_avg": 4.0, 
"replies_avg": 9, "authors#_avg": 3, "corr_rating_confidence": 0.816496580927726, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3618913633656784201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "7CtUcT_OHmC", "title": "Learning Human Contribution Preferences in Collaborative Human-Robot Tasks", "track": "main", "status": "Poster", "tldr": "We propose an interaction paradigm and approach for learning human contribution constraints in a collaborative human-robot task, where successful teams maximize task objectives while adhering to human and robot constraints.", "abstract": "In human-robot collaboration, both human and robotic agents must work together to achieve a set of shared objectives. However, each team member may have individual preferences, or constraints, for how they would like to contribute to the task. Effective teams align their actions to optimize task performance while satisfying each team member's constraints to the greatest extent possible. We propose a framework for representing human and robot contribution constraints in collaborative human-robot tasks. Additionally, we present an approach for learning a human partner's contribution constraint online during a collaborative interaction. We evaluate our approach using a variety of simulated human partners in a collaborative decluttering task. Our results demonstrate that our method improves team performance over baselines with some, but not all, simulated human partners. Furthermore, we conducted a pilot user study to gather preliminary insights into the effectiveness of our approach on task performance and collaborative fluency. 
Preliminary results suggest that pilot users performed fluently with our method, motivating further investigation into considering preferences that emerge from collaborative interactions.", "keywords": "human-robot collaboration;reward learning;human-robot interaction", "primary_area": "", "supplementary_material": "/attachment/3865602519cc6389415d23f38f1b99fc6b730cb3.zip", "author": "Michelle D Zhao;Reid Simmons;Henny Admoni", "authorids": "~Michelle_D_Zhao1;~Reid_Simmons1;~Henny_Admoni1", "gender": "F;M;", "homepage": "https://mzhao98.github.io/;https://www.cs.cmu.edu/~reids;https://hennyadmoni.com", "dblp": ";;44/7075", "google_scholar": "Gu4eXZwAAAAJ;;XXiZaA4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Michelle_D_Zhao1;~Reid_Simmons1;~Henny_Admoni1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhao2023learning,\ntitle={Learning Human Contribution Preferences in Collaborative Human-Robot Tasks},\nauthor={Michelle D Zhao and Reid Simmons and Henny Admoni},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=7CtUcT_OHmC}\n}", "github": "", "project": "", "reviewers": "cJgK;92c8;TiNh;Fwxy", "site": "https://openreview.net/forum?id=7CtUcT_OHmC", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;3;4;3", "rating_avg": 5.5, "confidence_avg": 3.25, "replies_avg": 7, "authors#_avg": 3, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12720078456977943940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "7Pkzm2FgUmq", "title": "SLAP: Spatial-Language Attention Policies", "track": "main", "status": "Poster", "tldr": "We propose SLAP, a way of training vision+language policies which work for mobile robots and trains with very little data.", "abstract": "Despite great strides in language-guided manipulation, existing work has been constrained to table-top settings. Table-tops allow for perfect and consistent camera angles, properties are that do not hold in mobile manipulation. Task plans that involve moving around the environment must be robust to egocentric views and changes in the plane and angle of grasp. A further challenge is ensuring this is all true while still being able to learn skills efficiently from limited data. We propose Spatial-Language Attention Policies (SLAP) as a solution. SLAP uses three-dimensional tokens as the input representation to train a single multi-task, language-conditioned action prediction policy. Our method shows an 80% success rate in the real world across eight tasks with a single model, and a 47.5% success rate when unseen clutter and unseen object configurations are introduced, even with only a handful of examples per task. This represents an improvement of 30% over prior work (20% given unseen distractors and configurations). We see a 4x improvement over baseline in mobile manipulation setting. 
In addition, we show how SLAPs robustness allows us to execute Task Plans from open-vocabulary instructions using a large language model for multi-step mobile manipulation. For videos, see the website: https://robotslap.github.io", "keywords": "learning from demonstration;language-based robotics", "primary_area": "", "supplementary_material": "/attachment/b7278bfd7251b8733ccd348f4251cad87f8a013f.zip", "author": "Priyam Parashar;Vidhi Jain;Xiaohan Zhang;Jay Vakil;Sam Powers;Yonatan Bisk;Chris Paxton", "authorids": "~Priyam_Parashar1;~Vidhi_Jain2;~Xiaohan_Zhang7;~Jay_Vakil1;~Sam_Powers1;~Yonatan_Bisk1;~Chris_Paxton1", "gender": "F;F;M;M;;M;M", "homepage": "http://acsweb.ucsd.edu/~pparasha/;http://vidhijain.github.io;https://keke-220.github.io/;https://jdvakil.github.io;https://www.ri.cmu.edu/ri-people/samantha-powers/;http://www.YonatanBisk.com;https://cpaxton.github.io/", "dblp": "177/1873;199/2574;;345/8174;;38/9282;", "google_scholar": "XaD54D8AAAAJ;;uWfcPkkAAAAJ;https://scholar.google.com/citations?hl=en;;bWoGh8UAAAAJ;I1mOQpAAAAAJ", "orcid": ";;;;;0000-0002-2111-9081;", "linkedin": ";vidhijain96/;;jdvakil;;yonatanbisk/;", "or_profile": "~Priyam_Parashar1;~Vidhi_Jain2;~Xiaohan_Zhang7;~Jay_Vakil1;~Sam_Powers1;~Yonatan_Bisk1;~Chris_Paxton1", "aff": "Meta Facebook;Google;State University of New York at Binghamton;Meta AI ;Carnegie Mellon University;Meta;Meta Platforms", "aff_domain": "fb.com;google.com;binghamton.edu;meta.com;cmu.edu;meta.com;meta.com", "position": "Researcher;Student Researcher;PhD student;Researcher;PhD student;Visiting Professor;Researcher", "bibtex": "@inproceedings{\nparashar2023slap,\ntitle={{SLAP}: Spatial-Language Attention Policies},\nauthor={Priyam Parashar and Vidhi Jain and Xiaohan Zhang and Jay Vakil and Sam Powers and Yonatan Bisk and Chris Paxton},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=7Pkzm2FgUmq}\n}", "github": "https://github.com/facebookresearch/home-robot/tree/main/projects/slap_manipulation", "project": "", "reviewers": "1D6s;91vF;zJZk;iD6X", "site": "https://openreview.net/forum?id=7Pkzm2FgUmq", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;4;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7585715163398209665&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0;3;0;0", "aff_unique_norm": "Meta;Google;State University of New York at Binghamton;Carnegie Mellon University", "aff_unique_dep": "Meta Platforms, Inc.;Google;;", "aff_unique_url": "https://meta.com;https://www.google.com;https://www.binghamton.edu;https://www.cmu.edu", "aff_unique_abbr": "Meta;Google;SUNY Binghamton;CMU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Mountain View;Binghamton", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7TYeO2XVqI", "title": "SayTap: Language to Quadrupedal Locomotion", "track": "main", "status": "Poster", "tldr": "We propose to use foot contact pattern as an interface to bridge human commands in natural language and low-level commands", "abstract": "Large language models (LLMs) have demonstrated the potential to perform high-level planning. Yet, it remains a challenge for LLMs to comprehend low-level commands, such as joint angle targets or motor torques. 
This paper proposes an approach to use foot contact patterns as an interface that bridges human commands in natural language and a locomotion controller that outputs these low-level commands. This results in an interactive system for quadrupedal robots that allows the users to craft diverse locomotion behaviors flexibly. We contribute an LLM prompt design, a reward function, and a method to expose the controller to the feasible distribution of contact patterns. The results are a controller capable of achieving diverse locomotion patterns that can be transferred to real robot hardware. Compared with other design choices, the proposed approach enjoys more than 50% success rate in predicting the correct contact patterns and can solve 10 more tasks out of a total of 30 tasks. (\\url{https://saytap.github.io})", "keywords": "Large language model (LLM);Quadrupedal robots;Locomotion", "primary_area": "", "supplementary_material": "/attachment/e0b018cce8448ac92835fb79b977dc2ef3209e5f.zip", "author": "Yujin Tang;Wenhao Yu;Jie Tan;Heiga Zen;Aleksandra Faust;Tatsuya Harada", "authorids": "~Yujin_Tang1;~Wenhao_Yu1;~Jie_Tan1;~Heiga_Zen1;~Aleksandra_Faust1;~Tatsuya_Harada1", "gender": "M;M;M;M;F;M", "homepage": ";https://wenhaoyu.weebly.com/;http://www.jie-tan.net;https://research.google/people/heigazen;http://www.afaust.info;https://www.mi.t.u-tokyo.ac.jp/harada/", "dblp": "190/1177.html;;81/7419;42/7014;135/8420;14/5849", "google_scholar": "https://scholar.google.co.jp/citations?user=3czUzRYAAAAJ;1bF2s2kAAAAJ;neGbgzYAAAAJ;z3IRvDwAAAAJ;RK72t68AAAAJ;https://scholar.google.com/citations?hl=ja", "orcid": ";;;0000-0002-8959-5471;0000-0002-3268-8685;", "linkedin": ";;jie-tan/;heiga-zen-b1a64b3;aleksandrafaust;", "or_profile": "~Yujin_Tang1;~Wenhao_Yu1;~Jie_Tan1;~Heiga_Zen1;~Aleksandra_Faust1;~Tatsuya_Harada1", "aff": "Google;Google;Google;Google;Google Brain;The University of Tokyo", "aff_domain": "google.com;google.com;google.com;google.com;google.com;u-tokyo.ac.jp", "position": "Researcher;Software Engineer;Research Scientist;Researcher;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\ntang2023saytap,\ntitle={SayTap: Language to Quadrupedal Locomotion},\nauthor={Yujin Tang and Wenhao Yu and Jie Tan and Heiga Zen and Aleksandra Faust and Tatsuya Harada},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=7TYeO2XVqI}\n}", "github": "", "project": "", "reviewers": "erUF;Ztd6;8oYu;NnfC", "site": "https://openreview.net/forum?id=7TYeO2XVqI", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;5;4;4", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13823207030350397172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Google;University of Tokyo", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.u-tokyo.ac.jp", "aff_unique_abbr": "Google;UTokyo", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;1", "aff_country_unique": "United States;Japan" }, { "id": "86aMPJn6hX9F", "title": "Stabilize to Act: Learning to Coordinate for Bimanual Manipulation", "track": "main", "status": "Oral", "tldr": "We present a system for bimanual manipulation that coordinates by assigning roles to arms: a stabilizing arm holds an object stationary while an acting 
arm acts in this simplified environment.", "abstract": "Key to rich, dexterous manipulation in the real world is the ability to coordinate control across two hands. However, while the promise afforded by bimanual robotic systems is immense, constructing control policies for dual arm autonomous systems brings inherent difficulties. One such difficulty is the high-dimensionality of the bimanual action space, which adds complexity to both model-based and data-driven methods. We counteract this challenge by drawing inspiration from humans to propose a novel role assignment framework: a stabilizing arm holds an object in place to simplify the environment while an acting arm executes the task. We instantiate this framework with BimanUal Dexterity from Stabilization (BUDS), which uses a learned restabilizing classifier to alternate between updating a learned stabilization position to keep the environment unchanged, and accomplishing the task with an acting policy learned from demonstrations.\nWe evaluate BUDS on four bimanual tasks of varying complexities on real-world robots, such as zipping jackets and cutting vegetables. \nGiven only 20 demonstrations, BUDS achieves 76.9% task success across our task suite, and generalizes to out-of-distribution objects within a class with a 52.7% success rate. BUDS is 56.0% more successful than an unstructured baseline that instead learns a BC stabilizing policy due to the precision required of these complex tasks. Supplementary material and videos can be found at https://tinyurl.com/stabilizetoact.", "keywords": "Bimanual Manipulation;Learning from Demonstrations;Deformable Object Manipulation", "primary_area": "", "supplementary_material": "/attachment/ce94fd44d17c8fec407a22da5dd48c59719d9526.zip", "author": "Jennifer Grannen;Yilin Wu;Brandon Vu;Dorsa Sadigh", "authorids": "~Jennifer_Grannen1;~Yilin_Wu1;~Brandon_Vu1;~Dorsa_Sadigh1", "gender": ";F;;F", "homepage": "https://jenngrannen.com;http://cs.cmu.edu/~yilinwu;https://brandonvu.super.site/;https://dorsa.fyi/", "dblp": ";66/3299.html;;117/3174", "google_scholar": "O5wWFpIAAAAJ;lyG0vMQAAAAJ;;ZaJEZpYAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jennifer_Grannen1;~Yilin_Wu1;~Brandon_Vu1;~Dorsa_Sadigh1", "aff": "Computer Science Department, Stanford University;Stanford University;Computer Science Department, Stanford University;Stanford University", "aff_domain": "cs.stanford.edu;stanford.edu;cs.stanford.edu;stanford.edu", "position": "PhD student;MS student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\ngrannen2023stabilize,\ntitle={Stabilize to Act: Learning to Coordinate for Bimanual Manipulation},\nauthor={Jennifer Grannen and Yilin Wu and Brandon Vu and Dorsa Sadigh},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=86aMPJn6hX9F}\n}", "github": "", "project": "", "reviewers": "u9KV;ku5C;bbdq;13Nc", "site": "https://openreview.net/forum?id=86aMPJn6hX9F", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "3;4;3;4", "rating_avg": 8.0, "confidence_avg": 3.5, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3196382265590263877&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", 
"aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "8L6pHd9aS6w", "title": "XSkill: Cross Embodiment Skill Discovery", "track": "main", "status": "Poster", "tldr": "We present XSkill, an imitation learning framework that can discover, transfer and compose skills from cross-embodiment demonstration.", "abstract": "Human demonstration videos are a widely available data source for robot learning and an intuitive user interface for expressing desired behavior. However, directly extracting reusable robot manipulation skills from unstructured human videos is challenging due to the big embodiment difference and unobserved action parameters. To bridge this embodiment gap, this paper introduces XSkill, an imitation learning framework that 1) discovers a cross-embodiment representation called skill prototypes purely from unlabeled human and robot manipulation videos, 2) transfers the skill representation to robot actions using conditional diffusion policy, and finally, 3) composes the learned skill to accomplish unseen tasks specified by a human prompt video. Our experiments in simulation and real-world environments show that the discovered skill prototypes facilitate both skill transfer and composition for unseen tasks, resulting in a more general and scalable imitation learning framework.", "keywords": "Manipulation;Representation Learning;Cross-Embodiements", "primary_area": "", "supplementary_material": "/attachment/918f0d5ee506ee08a12c821ed337e3cd2d9f1aae.zip", "author": "Mengda Xu;Zhenjia Xu;Cheng Chi;Manuela Veloso;Shuran Song", "authorids": "~Mengda_Xu1;~Zhenjia_Xu1;~Cheng_Chi4;~Manuela_Veloso1;~Shuran_Song3", "gender": "M;M;M;F;F", "homepage": "https://mengdaxu.github.io/;https://www.zhenjiaxu.com/;https://cheng-chi.github.io/;https://www.cs.cmu.edu/~mmv/;https://shurans.github.io/", "dblp": ";238/0000;;v/ManuelaMVeloso;", "google_scholar": "https://scholar.google.com/citations?hl=en;QE8cLMEAAAAJ;EO0PHdAAAAAJ;https://scholar.google.com.tw/citations?user=2FbkAzYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": "mengda-xu-132b57135/;;;;", "or_profile": "~Mengda_Xu1;~Zhenjia_Xu1;~Cheng_Chi4;~Manuela_Veloso1;~Shuran_Song3", "aff": "Columbia University;Columbia University;Columbia University;School of Computer Science, Carnegie Mellon University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu;cs.cmu.edu;cs.columbia.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2023xskill,\ntitle={{XS}kill: Cross Embodiment Skill Discovery},\nauthor={Mengda Xu and Zhenjia Xu and Cheng Chi and Manuela Veloso and Shuran Song},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=8L6pHd9aS6w}\n}", "github": "https://xskill.cs.columbia.edu/", "project": "", "reviewers": "9Eu4;1ysS;fc5Y", "site": "https://openreview.net/forum?id=8L6pHd9aS6w", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;3;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 70, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3151439163863750310&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Columbia University;Carnegie Mellon University", "aff_unique_dep": ";School of Computer Science", "aff_unique_url": 
"https://www.columbia.edu;https://www.cmu.edu", "aff_unique_abbr": "Columbia;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8asqEWO479I", "title": "Push Past Green: Learning to Look Behind Plant Foliage by Moving It", "track": "main", "status": "Poster", "tldr": "We tackle the plant self-occlusion problem by learning neural networks that predict the outcome of plant interaction actions.", "abstract": "Autonomous agriculture applications (e.g., inspection, phenotyping, plucking fruits) require manipulating the plant foliage to look behind the leaves and the branches. Partial visibility, extreme clutter, thin structures, and unknown geometry and dynamics for plants make such manipulation challenging. We tackle these challenges through data-driven methods. We use self-supervision to train SRPNet, a neural network that predicts what space is revealed on execution of a candidate action on a given plant. We use SRPNet with the cross-entropy method to predict actions that are effective at revealing space beneath plant foliage. Furthermore, as SRPNet does not just predict how much space is revealed but also where it is revealed, we can execute a sequence of actions that incrementally reveal more and more space beneath the plant foliage. We experiment with a synthetic (vines) and a real plant (Dracaena) on a physical test-bed across 5 settings including 2 settings that test generalization to novel plant configurations. Our experiments reveal the effectiveness of our overall method, PPG, over a competitive hand-crafted exploration method, and the effectiveness of SRPNet over a hand-crafted dynamics model and relevant ablations. Project website with execution videos, code, data, and models: https://sites.google.com/view/pushingfoliage/.", "keywords": "Deformable Object Manipulation;Model-building;Self-supervision", "primary_area": "", "supplementary_material": "/attachment/992b9be4863d0413f91f21a33dacd50aaa4c2284.zip", "author": "Xiaoyu Zhang;Saurabh Gupta", "authorids": "~Xiaoyu_Zhang12;~Saurabh_Gupta1", "gender": "F;", "homepage": "https://erinzhang1998.github.io/;http://saurabhg.web.illinois.edu", "dblp": ";06/5843-1", "google_scholar": ";1HO5UacAAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Xiaoyu_Zhang12;~Saurabh_Gupta1", "aff": "Department of Computer Science;University of Illinois, Urbana Champaign", "aff_domain": "cs.illinois.edu;illinois.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023push,\ntitle={Push Past Green: Learning to Look Behind Plant Foliage by Moving It},\nauthor={Xiaoyu Zhang and Saurabh Gupta},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=8asqEWO479I}\n}", "github": "https://github.com/ErinZhang1998/pushpastgreen", "project": "", "reviewers": "raTR;qgE2;N9Cs;23Je", "site": "https://openreview.net/forum?id=8asqEWO479I", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "3;3;4;4", "rating_avg": 7.5, "confidence_avg": 3.5, "replies_avg": 10, "authors#_avg": 2, "corr_rating_confidence": 0.9622504486493761, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11961945359378851609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1", "aff_unique_norm": "Unknown Institution;University of Illinois Urbana-Champaign", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": 
";https://illinois.edu", "aff_unique_abbr": ";UIUC", "aff_campus_unique_index": "1", "aff_campus_unique": ";Urbana-Champaign", "aff_country_unique_index": "1", "aff_country_unique": ";United States" }, { "id": "8scj3Y0RLq", "title": "Task Generalization with Stability Guarantees via Elastic Dynamical System Motion Policies", "track": "main", "status": "Poster", "tldr": "We propose a dynamical system based motion policy learning and generalization method with stability guarantees that can adapt to new task configurations without new demonstrations", "abstract": "Dynamical System (DS) based Learning from Demonstration (LfD) allows learning of reactive motion policies with stability and convergence guarantees from a few trajectories. Yet, current DS learning techniques lack the flexibility to generalize to new task instances as they overlook explicit task parameters that inherently change the underlying demonstrated trajectories. In this work, we propose Elastic-DS, a novel DS learning and generalization approach that embeds task parameters into the Gaussian Mixture Model (GMM) based Linear Parameter Varying (LPV) DS formulation. Central to our approach is the Elastic-GMM, a GMM constrained to SE(3) task-relevant frames. Given a new task instance/context, the Elastic-GMM is transformed with Laplacian Editing and used to re-estimate the LPV-DS policy. Elastic-DS is compositional in nature and can be used to construct flexible multi-step tasks. We showcase its strength on a myriad of simulated and real-robot experiments while preserving desirable control-theoretic guarantees.", "keywords": "Stable Dynamical Systems;Reactive Motion Policies;Learning from Demonstrations;Task Parametrization;Task Generalization", "primary_area": "", "supplementary_material": "/attachment/4d06fb00ef8fd23f3c660317a4c432d9e24cbdad.zip", "author": "Tianyu Li;Nadia Figueroa", "authorids": "~Tianyu_Li9;~Nadia_Figueroa1", "gender": "M;F", "homepage": "http://imtianyuli.com/;https://nbfigueroa.github.io/", "dblp": ";116/8822", "google_scholar": ";1NQRXHQAAAAJ", "orcid": ";0000-0002-6873-4671", "linkedin": ";nadiabarbara/", "or_profile": "~Tianyu_Li9;~Nadia_Figueroa1", "aff": "University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;seas.upenn.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nli2023task,\ntitle={Task Generalization with Stability Guarantees via Elastic Dynamical System Motion Policies},\nauthor={Tianyu Li and Nadia Figueroa},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=8scj3Y0RLq}\n}", "github": "", "project": "", "reviewers": "KgCv;uBxH;4LLc", "site": "https://openreview.net/forum?id=8scj3Y0RLq", "pdf_size": 0, "rating": "4;10;10", "confidence": "2;4;4", "rating_avg": 8.0, "confidence_avg": 3.3333333333333335, "replies_avg": 13, "authors#_avg": 2, "corr_rating_confidence": 1.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2866117373890839548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "8yTS_nAILxt", "title": "REFLECT: Summarizing Robot Experiences for Failure Explanation and Correction", "track": "main", "status": "Poster", "tldr": "We propose 
REFLECT, a framework that leverages Large Language Models (LLMs) for robot failure explanation and correction, based on a hierarchical summary of robot past experiences generated from multisensory data.", "abstract": "The ability to detect and analyze failed executions automatically is crucial for an explainable and robust robotic system. Recently, Large Language Models (LLMs) have demonstrated strong reasoning abilities on textual inputs. To leverage the power of LLMs for robot failure explanation, we introduce REFLECT, a framework which queries LLM for failure reasoning based on a hierarchical summary of robot past experiences generated from multisensory observations. The failure explanation can further guide a language-based planner to correct the failure and complete the task. To systematically evaluate the framework, we create the RoboFail dataset with a variety of tasks and failure scenarios. We demonstrate that the LLM-based framework is able to generate informative failure explanations that assist successful correction planning.", "keywords": "Large Language Model;Explainable AI;Task Planning", "primary_area": "", "supplementary_material": "/attachment/307057542ed2a72ea413e4d2bfc82de5063a6191.zip", "author": "Zeyi Liu;Arpit Bahety;Shuran Song", "authorids": "~Zeyi_Liu1;~Arpit_Bahety1;~Shuran_Song3", "gender": "F;M;F", "homepage": "https://lzylucy.github.io;https://arpitrf.github.io/;https://shurans.github.io/", "dblp": ";251/5918;", "google_scholar": ";g6k9tcMAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": "zeyi-liu;;", "or_profile": "~Zeyi_Liu1;~Arpit_Bahety1;~Shuran_Song3", "aff": "Columbia University;Columbia University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;cs.columbia.edu", "position": "MS student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nliu2023reflect,\ntitle={{REFLECT}: Summarizing Robot Experiences for Failure Explanation and Correction},\nauthor={Zeyi Liu and Arpit Bahety and Shuran Song},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=8yTS_nAILxt}\n}", "github": "https://github.com/real-stanford/reflect", "project": "", "reviewers": "BfQU;GeFi;6L3t;LGvx", "site": "https://openreview.net/forum?id=8yTS_nAILxt", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;4;3;3", "rating_avg": 6.5, "confidence_avg": 3.75, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.7608859102526822, "gs_citation": 136, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13568943050059915533&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Columbia University", "aff_unique_dep": "", "aff_unique_url": "https://www.columbia.edu", "aff_unique_abbr": "Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "93qz1k6_6h", "title": "Dexterous Functional Grasping", "track": "main", "status": "Poster", "tldr": "We show a generalizable approach to dexterous functional grasping using sim2real", "abstract": "While there have been significant strides in dexterous manipulation, most of it is limited to benchmark tasks like in-hand reorientation which are of limited utility in the real world. The main benefit of dexterous hands over two-fingered ones is their ability to pickup tools and other objects (including thin ones) and grasp them firmly in order to apply force. 
However, this task requires both a complex understanding of functional affordances as well as precise low-level control. While prior work obtains affordances from human data this approach doesn't scale to low-level control. Similarly, simulation training cannot give the robot an understanding of real-world semantics. In this paper, we aim to combine the best of both worlds to accomplish functional grasping for in-the-wild objects. We use a modular approach. First, affordances are obtained by matching corresponding regions of different objects and then a low-level policy trained in sim is run to grasp it. We propose a novel application of eigengrasps to reduce the search space of RL using a small amount of human data and find that it leads to more stable and physically realistic motion. We find that eigengrasp action space beats baselines in simulation and outperforms hardcoded grasping in real and matches or outperforms a trained human teleoperator. Videos at https://dexfunc.github.io/.", "keywords": "Functional Grasping;Sim2real", "primary_area": "", "supplementary_material": "", "author": "Ananye Agarwal;Shagun Uppal;Kenneth Shaw;Deepak Pathak", "authorids": "~Ananye_Agarwal1;~Shagun_Uppal1;~Kenneth_Shaw1;~Deepak_Pathak1", "gender": "M;F;M;M", "homepage": "https://anag.me/;;https://www.linkedin.com/in/kenny-shaw/;https://www.cs.cmu.edu/~dpathak/", "dblp": "294/4812;;;155/9860", "google_scholar": "https://scholar.google.com/citations?hl=en;cjo5X3QAAAAJ;;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;;", "linkedin": ";shagunuppal/;kenny-shaw/;pathak22/", "or_profile": "~Ananye_Agarwal1;~Shagun_Uppal1;~Kenneth_Shaw1;~Deepak_Pathak1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;MS student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nagarwal2023dexterous,\ntitle={Dexterous Functional Grasping},\nauthor={Ananye Agarwal and Shagun Uppal and Kenneth Shaw and Deepak Pathak},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=93qz1k6_6h}\n}", "github": "", "project": "", "reviewers": "qb6f;TMM4;Kobm;6xz6", "site": "https://openreview.net/forum?id=93qz1k6_6h", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;4;4", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4777305485406173529&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "9GRE34K0SB", "title": "AdaptSim: Task-Driven Simulation Adaptation for Sim-to-Real Transfer", "track": "main", "status": "Poster", "tldr": "We propose AdaptSim, a new task-driven adaptation framework for sim-to-real transfer that aims to optimize task performance in target (real) environments", "abstract": "Simulation parameter settings such as contact models and object geometry approximations are critical to training robust manipulation policies capable of transferring from simulation to real-world deployment. 
There is often an irreducible gap between simulation and reality: attempting to match the dynamics between simulation and reality may be infeasible and may not lead to policies that perform well in reality for a specific task. We propose AdaptSim, a new task-driven adaptation framework for sim-to-real transfer that aims to optimize task performance in target (real) environments. First, we meta-learn an adaptation policy in simulation using reinforcement learning for adjusting the simulation parameter distribution based on the current policy's performance in a target environment. We then perform iterative real-world adaptation by inferring new simulation parameter distributions for policy training. Our extensive simulation and hardware experiments demonstrate AdaptSim achieving 1-3x asymptotic performance and 2x real data efficiency when adapting to different environments, compared to methods based on Sys-ID and directly training the task policy in target environments.", "keywords": "Contact-rich manipulation;sim-to-real transfer", "primary_area": "", "supplementary_material": "/attachment/11321e89e1641f7ebf3ca4727b65b00b08058a0a.zip", "author": "Allen Z. Ren;Hongkai Dai;Benjamin Burchfiel;Anirudha Majumdar", "authorids": "~Allen_Z._Ren1;~Hongkai_Dai1;~Benjamin_Burchfiel1;~Anirudha_Majumdar1", "gender": "M;;M;M", "homepage": "http://allenzren.github.io/;;http://www.benburchfiel.com/;https://irom-lab.princeton.edu/majumdar/", "dblp": ";;136/9247;116/6436", "google_scholar": "mgMzkYMAAAAJ;ZZsEXLAAAAAJ;eGoTK1YAAAAJ;ibu3FwsAAAAJ", "orcid": ";;;", "linkedin": "allenzren/;;benburchfiel/;", "or_profile": "~Allen_Z._Ren1;~Hongkai_Dai1;~Benjamin_Burchfiel1;~Anirudha_Majumdar1", "aff": "Google DeepMind;Toyota Research Institute;Dexterous Manipulation Group, Toyota Research Institute;Princeton University", "aff_domain": "google.com;tri.global;tri.global;princeton.edu", "position": "Intern;Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nren2023adaptsim,\ntitle={AdaptSim: Task-Driven Simulation Adaptation for Sim-to-Real Transfer},\nauthor={Allen Z. 
Ren and Hongkai Dai and Benjamin Burchfiel and Anirudha Majumdar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9GRE34K0SB}\n}", "github": "https://github.com/irom-lab/AdaptSim", "project": "", "reviewers": "FEC8;zQUg;bMZi;XPgu", "site": "https://openreview.net/forum?id=9GRE34K0SB", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;3;3;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4249547302399520913&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Google;Toyota Research Institute;Princeton University", "aff_unique_dep": "Google DeepMind;;", "aff_unique_url": "https://deepmind.com;https://www.tri.global;https://www.princeton.edu", "aff_unique_abbr": "DeepMind;TRI;Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "9SM6l0HyY_", "title": "Learning Generalizable Manipulation Policies with Object-Centric 3D Representations", "track": "main", "status": "Poster", "tldr": "We introduce an imitation learning method for training policies that generalize beyond their demonstration settings.", "abstract": "We introduce GROOT, an imitation learning method for learning robust policies with object-centric and 3D priors. GROOT builds policies that generalize beyond their initial training conditions for vision-based manipulation. It constructs object-centric 3D representations that are robust toward background changes and camera views and reason over these representations using a transformer-based policy. Furthermore, we introduce a segmentation correspondence model that allows policies to generalize to new objects at test time. Through comprehensive experiments, we validate the robustness of GROOT policies against perceptual variations in simulated and real-world environments. GROOT\u2019s performance excels in generalization over background changes, camera viewpoint shifts, and the presence of new object instances, whereas both state-of-the-art end-to-end learning methods and object proposal-based approaches fall short. We also extensively evaluate GROOT policies on real robots, where we demonstrate the efficacy under very wild changes in setup. 
More videos and model details can be found in the appendix and the project website https://ut-austin-rpl.github.io/GROOT.", "keywords": "robot manipulation;imitation learning;object-centric representations", "primary_area": "", "supplementary_material": "/attachment/a2c4364be35ddf6f95fe8fa00d5e544b97d2cb85.zip", "author": "Yifeng Zhu;Zhenyu Jiang;Peter Stone;Yuke Zhu", "authorids": "~Yifeng_Zhu2;~Zhenyu_Jiang1;~Peter_Stone1;~Yuke_Zhu1", "gender": "M;M;M;M", "homepage": "https://cs.utexas.edu/~yifengz;https://zhenyujiang.me/;http://www.cs.utexas.edu/~pstone;https://cs.utexas.edu/~yukez/", "dblp": ";55/10479-2;s/PeterStone;133/1772", "google_scholar": ";2KLTzZIAAAAJ;qnwjcfAAAAAJ;mWGyYMsAAAAJ", "orcid": ";0000-0002-9711-7461;0000-0002-6795-420X;", "linkedin": ";;;", "or_profile": "~Yifeng_Zhu2;~Zhenyu_Jiang1;~Peter_Stone1;~Yuke_Zhu1", "aff": "The University of Texas at Austin;University of Texas, Austin;University of Texas, Austin;Computer Science Department, University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;cs.utexas.edu", "position": "PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhu2023learning,\ntitle={Learning Generalizable Manipulation Policies with Object-Centric 3D Representations},\nauthor={Yifeng Zhu and Zhenyu Jiang and Peter Stone and Yuke Zhu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9SM6l0HyY_}\n}", "github": "https://github.com/UT-Austin-RPL/GROOT", "project": "", "reviewers": "p4GH;Yp2B;YKj9;ghms", "site": "https://openreview.net/forum?id=9SM6l0HyY_", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13468363876551788063&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "9_8LF30mOC", "title": "VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models", "track": "main", "status": "Oral", "tldr": "", "abstract": "Large language models (LLMs) are shown to possess a wealth of actionable knowledge that can be extracted for robot manipulation in the form of reasoning and planning. Despite the progress, most still rely on pre-defined motion primitives to carry out the physical interactions with the environment, which remains a major bottleneck. In this work, we aim to synthesize robot trajectories, i.e., a dense sequence of 6-DoF end-effector waypoints, for a large variety of manipulation tasks given an open-set of instructions and an open-set of objects. We achieve this by first observing that LLMs excel at inferring affordances and constraints given a free-form language instruction. More importantly, by leveraging their code-writing capabilities, they can interact with a vision-language model (VLM) to compose 3D value maps to ground the knowledge into the observation space of the agent. The composed value maps are then used in a model-based planning framework to zero-shot synthesize closed-loop robot trajectories with robustness to dynamic perturbations. 
We further demonstrate how the proposed framework can benefit from online experiences by efficiently learning a dynamics model for scenes that involve contact-rich interactions. We present a large-scale study of the proposed method in both simulated and real-robot environments, showcasing the ability to perform a large variety of everyday manipulation tasks specified in free-form natural language.", "keywords": "Manipulation;Large Language Models;Model-based Planning", "primary_area": "", "supplementary_material": "/attachment/b3ad12f8510ed476ce2eda5cf64e2edcb11956b6.zip", "author": "Wenlong Huang;Chen Wang;Ruohan Zhang;Yunzhu Li;Jiajun Wu;Li Fei-Fei", "authorids": "~Wenlong_Huang1;~Chen_Wang16;~Ruohan_Zhang1;~Yunzhu_Li1;~Jiajun_Wu1;~Li_Fei-Fei1", "gender": "M;M;M;M;M;F", "homepage": "https://wenlong.page;http://www.chenwangjeremy.net/;https://ai.stanford.edu/~zharu/;https://yunzhuli.github.io/;https://jiajunwu.com;https://profiles.stanford.edu/fei-fei-li", "dblp": "82/2872;;;182/1831;117/4768;79/2528", "google_scholar": "hYVMrzsAAAAJ;lStkAzsAAAAJ;-bqvNWoAAAAJ;WlA92lcAAAAJ;2efgcS0AAAAJ;rDfyQnIAAAAJ", "orcid": ";;;;0000-0002-4176-343X;", "linkedin": ";;;;jiajunwu/;fei-fei-li-4541247/", "or_profile": "~Wenlong_Huang1;~Chen_Wang16;~Ruohan_Zhang1;~Yunzhu_Li1;~Jiajun_Wu1;~Li_Fei-Fei1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Postdoc;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2023voxposer,\ntitle={VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models},\nauthor={Wenlong Huang and Chen Wang and Ruohan Zhang and Yunzhu Li and Jiajun Wu and Li Fei-Fei},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9_8LF30mOC}\n}", "github": "https://github.com/huangwl18/VoxPoser", "project": "", "reviewers": "NGjD;7joN;tgHR", "site": "https://openreview.net/forum?id=9_8LF30mOC", "pdf_size": 0, "rating": "6;6;10", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 6, "corr_rating_confidence": 0.5, "gs_citation": 564, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7844573100140075704&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "9al6taqfTzr", "title": "Open-World Object Manipulation using Pre-Trained Vision-Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "For robots to follow instructions from people, they must be able to connect the rich semantic information in human vocabulary, e.g. ``can you get me the pink stuffed whale?'' to their sensory observations and actions. This brings up a notably difficult challenge for robots: while robot learning approaches allow robots to learn many different behaviors from first-hand experience, it is impractical for robots to have first-hand experiences that span all of this semantic information. 
We would like a robot's policy to be able to perceive and pick up the pink stuffed whale, even if it has never seen any data interacting with a stuffed whale before. Fortunately, static data on the internet has vast semantic information, and this information is captured in pre-trained vision-language models. In this paper, we study whether we can interface robot policies with these pre-trained models, with the aim of allowing robots to complete instructions involving object categories that the robot has never seen first-hand. We develop a simple approach, which we call Manipulation of Open-World Objects (MOO), which leverages a pre-trained vision-language model to extract object-identifying information from the language command and image, and conditions the robot policy on the current image, the instruction, and the extracted object information. In a variety of experiments on a real mobile manipulator, we find that MOO generalizes zero-shot to a wide range of novel object categories and environments. In addition, we show how MOO generalizes to other, non-language-based input modalities to specify the object of interest such as finger pointing, and how it can be further extended to enable open-world navigation and manipulation. The project\u2019s website and evaluation videos can be found at https://robot-moo.github.io/.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/10fe099c03117874805a272aa8bfd29ffc8483f8.zip", "author": "Austin Stone;Ted Xiao;Yao Lu;Keerthana Gopalakrishnan;Kuang-Huei Lee;Quan Vuong;Paul Wohlhart;Sean Kirmani;Brianna Zitkovich;Fei Xia;Chelsea Finn;Karol Hausman", "authorids": "~Austin_Stone1;~Ted_Xiao1;~Yao_Lu13;~Keerthana_Gopalakrishnan1;~Kuang-Huei_Lee1;~Quan_Vuong2;~Paul_Wohlhart1;~Sean_Kirmani1;zitkovich@google.com;~Fei_Xia1;~Chelsea_Finn1;~Karol_Hausman2", "gender": ";M;;F;M;M;M;M;;M;F;", "homepage": ";https://www.tedxiao.me;;https://keerthanapg.com;https://kuanghuei.github.io/;https://quanvuong.github.io;;https://kirmani.io/;;;https://ai.stanford.edu/~cbfinn/;", "dblp": "202/1823;198/0598;26/5662-6;;66/11466;;http://dblp.uni-trier.de/pers/hd/w/Wohlhart:Paul;;;;131/1783;", "google_scholar": "IU4ZllQAAAAJ;;OI7zFmwAAAAJ;;rE7-N30AAAAJ;NSWI3OwAAAAJ;SzHPa90AAAAJ;iyEuK8kAAAAJ;;pqP5_PgAAAAJ;vfPE6hgAAAAJ;", "orcid": ";;;;;;;;;0000-0003-4343-1444;;", "linkedin": "austin-charles-stone-1ba33b138/;;;;;;;skirmani;;;;", "or_profile": "~Austin_Stone1;~Ted_Xiao1;~Yao_Lu13;~Keerthana_Gopalakrishnan1;~Kuang-Huei_Lee1;~Quan_Vuong2;~Paul_Wohlhart1;~Sean_Kirmani1;zitkovich@google.com;~Fei_Xia1;~Chelsea_Finn1;~Karol_Hausman2", "aff": "Google;;Google;Research, Google;Google;;Graz University of Technology;Google X;;Google;Google;", "aff_domain": "google.com;;google.com;research.google.com;google.com;; ;x.team;;google.com;google.com;", "position": "Research Engineer;;Researcher;Researcher;Researcher;;Post Doc;Researcher;;Researcher;Research Scientist;", "bibtex": "@inproceedings{\nstone2023openworld,\ntitle={Open-World Object Manipulation using Pre-Trained Vision-Language Models},\nauthor={Austin Stone and Ted Xiao and Yao Lu and Keerthana Gopalakrishnan and Kuang-Huei Lee and Quan Vuong and Paul Wohlhart and Sean Kirmani and Brianna Zitkovich and Fei Xia and Chelsea Finn and Karol Hausman},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9al6taqfTzr}\n}", "github": "", "project": "", "reviewers": "VQYg;6xeG;VUMx;ZXLg", "site": "https://openreview.net/forum?id=9al6taqfTzr", "pdf_size": 0, "rating": 
"6;6;6;10", "confidence": "5;4;5;5", "rating_avg": 7.0, "confidence_avg": 4.75, "replies_avg": 11, "authors#_avg": 12, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 161, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6038066212688441225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1;0;0;0", "aff_unique_norm": "Google;Graz University of Technology", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.tugraz.at", "aff_unique_abbr": "Google;TUGraz", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;1;0;0;0", "aff_country_unique": "United States;Austria" }, { "id": "9bK38pUBzU", "title": "Language-Conditioned Path Planning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Contact is at the core of robotic manipulation. At times, it is desired (e.g. manipulation and grasping), and at times, it is harmful (e.g. when avoiding obstacles). However, traditional path planning algorithms focus solely on collision-free paths, limiting their applicability in contact-rich tasks. To address this limitation, we propose the domain of Language-Conditioned Path Planning, where contact-awareness is incorporated into the path planning problem. As a first step in this domain, we propose Language-Conditioned Collision Functions (LACO), a novel approach that learns a collision function using only a single-view image, language prompt, and robot configuration. LACO predicts collisions between the robot and the environment, enabling flexible, conditional path planning without the need for manual object annotations, point cloud data, or ground-truth object meshes. In both simulation and the real world, we demonstrate that LACO can facilitate complex, nuanced path plans that allow for interaction with objects that are safe to collide, rather than prohibiting any collision.", "keywords": "Robotic Manipulation;Path Planning;Collision Avoidance;Learned Collision Function", "primary_area": "", "supplementary_material": "/attachment/e3548e99b8dbf1396da6d338ea4f09f1fb5f19b4.zip", "author": "Amber Xie;Youngwoon Lee;Pieter Abbeel;Stephen James", "authorids": "~Amber_Xie1;~Youngwoon_Lee1;~Pieter_Abbeel2;~Stephen_James1", "gender": ";M;M;M", "homepage": ";https://youngwoon.github.io;https://people.eecs.berkeley.edu/~pabbeel/;https://stepjam.github.io/", "dblp": ";117/4767;;163/5669", "google_scholar": "https://scholar.google.com/citations?hl=en;CDPa3AgAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;OXtG-isAAAAJ", "orcid": ";0000-0001-9918-1056;;", "linkedin": ";;;", "or_profile": "~Amber_Xie1;~Youngwoon_Lee1;~Pieter_Abbeel2;~Stephen_James1", "aff": ";University of California, Berkeley;Covariant;Dyson", "aff_domain": ";berkeley.edu;covariant.ai;dyson.com", "position": ";Postdoc;Founder;Principal Researcher", "bibtex": "@inproceedings{\nxie2023languageconditioned,\ntitle={Language-Conditioned Path Planning},\nauthor={Amber Xie and Youngwoon Lee and Pieter Abbeel and Stephen James},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9bK38pUBzU}\n}", "github": "https://github.com/amberxie88/lapp", "project": "", "reviewers": "FRPN;bXSy;CVWw", "site": "https://openreview.net/forum?id=9bK38pUBzU", "pdf_size": 0, "rating": "6;6;10", "confidence": "3;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 
-0.5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4647990575555546338&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of California, Berkeley;Covariant;Dyson", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;;https://www.dyson.com", "aff_unique_abbr": "UC Berkeley;;", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;2", "aff_country_unique": "United States;;United Kingdom" }, { "id": "9cTEQWMo1BF", "title": "LabelFormer: Object Trajectory Re\ufb01nement for Offboard Perception from LiDAR Point Clouds", "track": "main", "status": "Poster", "tldr": "We propose LabelFormer, a simple, efficient and effective transformer-based trajectory refinement method that reasons with full temporal context for LiDAR-based auto-labelling.", "abstract": "A major bottleneck to scaling-up training of self-driving perception systems are the human annotations required for supervision. A promising alternative is to leverage \u201cauto-labelling\u201d offboard perception models that are trained to automatically generate annotations from raw LiDAR point clouds at a fraction of the cost. Auto-labels are most commonly generated via a two-stage approach \u2013 first objects are detected and tracked over time, and then each object trajectory is passed to a learned refinement model to improve accuracy. Since existing refinement models are overly complex and lack advanced temporal reasoning capabilities, in this work we propose LabelFormer, a simple, efficient, and effective trajectory-level refinement approach. Our approach first encodes each frame\u2019s observations separately, then exploits self-attention to reason about the trajectory with full temporal context, and finally decodes the re\ufb01ned object size and per-frame poses. Evaluation on both urban and highway datasets demonstrates that LabelFormer outperforms existing works by a large margin. Finally, we show that training on a dataset augmented with auto-labels generated by our method leads to improved downstream detection performance compared to existing methods. 
Please visit the project website for details https://waabi.ai/labelformer/.", "keywords": "Auto-labelling;Offboard Perception;Trajectory Refinement;Transformer", "primary_area": "", "supplementary_material": "/attachment/1150fd797ad0edd449b62c34c052182cbb16b8ea.zip", "author": "Anqi Joyce Yang;Sergio Casas;Nikita Dvornik;Sean Segal;Yuwen Xiong;Jordan Sir Kwang Hu;Carter Fang;Raquel Urtasun", "authorids": "~Anqi_Joyce_Yang1;~Sergio_Casas2;~Nikita_Dvornik1;~Sean_Segal1;~Yuwen_Xiong1;~Jordan_Sir_Kwang_Hu1;~Carter_Fang1;~Raquel_Urtasun1", "gender": "F;M;M;M;M;;F;M", "homepage": "https://www.cs.toronto.edu/~ajyang/;https://dvornikita.github.io/;;http://www.yuwenxiong.com;https://jskhu.github.io/;;http://www.cs.toronto.edu/~urtasun/;http://www.cs.toronto.edu/~sergio/", "dblp": "283/5790;205/2510;251/3238;178/3589;;;u/RaquelUrtasun;46/6535-2", "google_scholar": "DxnwQqgAAAAJ;UOLJQTIAAAAJ;DfhYi2QAAAAJ;7YALCcIAAAAJ;aPx2zd8AAAAJ;;https://scholar.google.ca/citations?user=jyxO2akAAAAJ;Vgo1x9YAAAAJ", "orcid": ";;;;;;;", "linkedin": "ajyang99/;;;;jskhu/;https://ch.linkedin.com/in/carterfang;;sergio-casas/", "or_profile": "~Anqi_Joyce_Yang1;~Nikita_Dvornik1;~Sean_Segal1;~Yuwen_Xiong1;~Jordan_Sir_Kwang_Hu1;~Carter_Fang1;~Raquel_Urtasun1;~Sergio_Casas_Romero1", "aff": "Waabi Innovation Inc;Waabi;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Waabi Innovation Inc.;;Department of Computer Science, University of Toronto;University of Toronto", "aff_domain": "waabi.ai;waabi.ai;cs.toronto.edu;cs.toronto.edu;waabi.ai;;cs.toronto.edu;toronto.edu", "position": "Researcher;Researcher;PhD student;PhD student;Researcher;;Full Professor;PhD student", "bibtex": "@inproceedings{\nyang2023labelformer,\ntitle={LabelFormer: Object Trajectory Refinement for Offboard Perception from Li{DAR} Point Clouds},\nauthor={Anqi Joyce Yang and Sergio Casas and Nikita Dvornik and Sean Segal and Yuwen Xiong and Jordan Sir Kwang Hu and Carter Fang and Raquel Urtasun},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=9cTEQWMo1BF}\n}", "github": "", "project": "", "reviewers": "b8wo;pNiA;xWt1", "site": "https://openreview.net/forum?id=9cTEQWMo1BF", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;2", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8988723225784945740&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;2;3;2;2", "aff_unique_norm": "Waabi Innovation Inc;Waabi;University of Toronto;Waabi Innovation Inc.", "aff_unique_dep": ";;Department of Computer Science;", "aff_unique_url": "https://www.waabi.ai;;https://www.utoronto.ca;https://www.waabi.ai", "aff_unique_abbr": ";;U of T;", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada;" }, { "id": "AIgm8ZE_DlD", "title": "A Universal Semantic-Geometric Representation for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "A representation that integrates both semantic understanding and 3D spatial reasoning.", "abstract": "Robots rely heavily on sensors, especially RGB and depth cameras, to perceive and interact with the world. RGB cameras record 2D images with rich semantic information while missing precise spatial information. 
On the other side, depth cameras offer critical 3D geometry data but capture limited semantics. Therefore, integrating both modalities is crucial for learning representations for robotic perception and control. However, current research predominantly focuses on only one of these modalities, neglecting the benefits of incorporating both. To this end, we present $\\textbf{Semantic-Geometric Representation} (\\textbf{SGR})$, a universal perception module for robotics that leverages the rich semantic information of large-scale pre-trained 2D models and inherits the merits of 3D spatial reasoning. Our experiments demonstrate that SGR empowers the agent to successfully complete a diverse range of simulated and real-world robotic manipulation tasks, outperforming state-of-the-art methods significantly in both single-task and multi-task settings. Furthermore, SGR possesses the capability to generalize to novel semantic attributes, setting it apart from the other methods. Project website: https://semantic-geometric-representation.github.io.", "keywords": "Representation Learning;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/c9669bdf5140d7cdff9f402171676de94676bad1.zip", "author": "Tong Zhang;Yingdong Hu;Hanchen Cui;Hang Zhao;Yang Gao", "authorids": "~Tong_Zhang23;~Yingdong_Hu1;~Hanchen_Cui1;~Hang_Zhao1;~Yang_Gao1", "gender": ";M;;M;M", "homepage": "https://tongzhangthu.github.io/;;https://morning-star-7.github.io/;http://www.mit.edu/~hangzhao/;http://yang-gao.weebly.com", "dblp": ";219/8916;;;89/4402-29", "google_scholar": "https://scholar.google.com/citations?hl=en;HhotyAoAAAAJ;f12zgScAAAAJ;DmahiOYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;yang-gao-45245348/", "or_profile": "~Tong_Zhang23;~Yingdong_Hu1;~Hanchen_Cui1;~Hang_Zhao1;~Yang_Gao1", "aff": "Tsinghua University;Tsinghua University;Shanghai Qi Zhi Institute;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;sqz.ac.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023a,\ntitle={A Universal Semantic-Geometric Representation for Robotic Manipulation},\nauthor={Tong Zhang and Yingdong Hu and Hanchen Cui and Hang Zhao and Yang Gao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=AIgm8ZE_DlD}\n}", "github": "", "project": "", "reviewers": "B5ZP;ebNV;RMKE;e9U4", "site": "https://openreview.net/forum?id=AIgm8ZE_DlD", "pdf_size": 0, "rating": "4;4;6;10", "confidence": "3;3;4;3", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=763081554177145503&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Tsinghua University;Shanghai Qi Zhi Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.qz.io", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "ANJuNDFdvP", "title": "UniFolding: Towards Sample-efficient, Scalable, and Generalizable Robotic Garment Folding", "track": "main", "status": "Poster", "tldr": "A unified solution for generalizable robotic garment unfolding and folding", "abstract": "This paper explores the 
development of UniFolding, a sample-efficient, scalable, and generalizable robotic system for unfolding and folding various garments. UniFolding employs the proposed UFONet neural network to integrate unfolding and folding decisions into a single policy model that is adaptable to different garment types and states. The design of UniFolding is based on a garment's partial point cloud, which aids in generalization and reduces sensitivity to variations in texture and shape. The training pipeline prioritizes low-cost, sample-efficient data collection. Training data is collected via a human-centric process with offline and online stages. The offline stage involves human unfolding and folding actions via Virtual Reality, while the online stage utilizes human-in-the-loop learning to fine-tune the model in a real-world setting. The system is tested on two garment types: long-sleeve and short-sleeve shirts. Performance is evaluated on 20 shirts with significant variations in textures, shapes, and materials. More experiments and videos can be found in the supplementary materials and on the website: \\url{https://unifolding.robotflow.ai}.", "keywords": "Deformable Object Manipulation;Bimanual Manipulation;Garment Folding", "primary_area": "", "supplementary_material": "/attachment/6fa9a1dc52011bbd23b6094f599d79e9148b747c.zip", "author": "Han Xue;Yutong Li;Wenqiang Xu;Huanyu Li;Dongzhe Zheng;Cewu Lu", "authorids": "~Han_Xue1;~Yutong_Li3;~Wenqiang_Xu2;~Huanyu_Li2;~Dongzhe_Zheng1;~Cewu_Lu3", "gender": "M;M;M;;M;M", "homepage": "https://hanxue.me/;https://davidliyutong.github.io;;;;https://www.mvig.org/", "dblp": "65/6824;;;;359/9725.html;", "google_scholar": "https://scholar.google.com.hk/citations?user=kQMFnUkAAAAJ;KSv942gAAAAJ;PdzO-4YAAAAJ;;Szg1nBQAAAAJ;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ", "orcid": ";0009-0002-6959-6159;0000-0002-8648-5576;;0009-0007-4105-0628;", "linkedin": "\u5bd2-\u859b-8696a4168;;;;denzel-zheng-b4297921a;", "or_profile": "~Han_Xue1;~Yutong_Li3;~Wenqiang_Xu2;~Huanyu_Li2;~Dongzhe_Zheng1;~Cewu_Lu3", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;;sjtu.edu.cn;sjtu.edu.cn", "position": "MS student;MS student;PhD student;;Researcher;Full Professor", "bibtex": "@inproceedings{\nxue2023unifolding,\ntitle={UniFolding: Towards Sample-efficient, Scalable, and Generalizable Robotic Garment Folding},\nauthor={Han Xue and Yutong Li and Wenqiang Xu and Huanyu Li and Dongzhe Zheng and Cewu Lu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ANJuNDFdvP}\n}", "github": "https://github.com/xiaoxiaoxh/UniFolding", "project": "", "reviewers": "Pcsh;Aqt2;3fbS;BruB", "site": "https://openreview.net/forum?id=ANJuNDFdvP", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;4;3", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7919467870844188172&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { 
"id": "AnDDMQgM7-", "title": "Equivariant Reinforcement Learning under Partial Observability", "track": "main", "status": "Poster", "tldr": "This paper embeds domain symmetry in actor-critic reinforcement learning agents to solve a specific class of partially observable domains", "abstract": "Incorporating inductive biases is a promising approach for tackling challenging robot learning domains with sample-efficient solutions. This paper identifies partially observable domains where symmetries can be a useful inductive bias for efficient learning. Specifically, by encoding the equivariance regarding specific group symmetries into the neural networks, our actor-critic reinforcement learning agents can reuse solutions in the past for related scenarios. Consequently, our equivariant agents outperform non-equivariant approaches significantly in terms of sample efficiency and final performance, demonstrated through experiments on a range of robotic tasks in simulation and real hardware.", "keywords": "Partial Observability;Equivariant Learning;Symmetry", "primary_area": "", "supplementary_material": "/attachment/04a0066fdab4c9592e6731f1b175b9aa87c4db2c.zip", "author": "Hai Huu Nguyen;Andrea Baisero;David Klee;Dian Wang;Robert Platt;Christopher Amato", "authorids": "~Hai_Huu_Nguyen1;~Andrea_Baisero1;~David_Klee1;~Dian_Wang1;~Robert_Platt1;~Christopher_Amato1", "gender": "M;M;M;M;;M", "homepage": "https://hai-h-nguyen.github.io/;;;https://pointw.github.io/;http://www.ccs.neu.edu/home/rplatt/;http://www.ccs.neu.edu/home/camato/index.html", "dblp": ";135/3247;313/9930;191/1369-1;39/5434;10/3254", "google_scholar": "5b9ncWoAAAAJ;;TJEEkJoAAAAJ;CckjtfQAAAAJ;Z4Y5S2oAAAAJ;-8-sD-sAAAAJ", "orcid": ";;;;;", "linkedin": ";;;dianwang1007;;", "or_profile": "~Hai_Huu_Nguyen1;~Andrea_Baisero1;~David_Klee1;~Dian_Wang1;~Robert_Platt1;~Christopher_Amato1", "aff": "Northeastern University;Northeastern University;Boston Dynamics Artificial Intelligence Institute;Boston Dynamics AI Institute;Northeastern University;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;theaiinstitute.com;theaiinstitute.com;neu.edu;northeastern.edu", "position": "PhD student;PhD student;Intern;Intern;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nnguyen2023equivariant,\ntitle={Equivariant Reinforcement Learning under Partial Observability},\nauthor={Hai Huu Nguyen and Andrea Baisero and David Klee and Dian Wang and Robert Platt and Christopher Amato},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=AnDDMQgM7-}\n}", "github": "https://github.com/hai-h-nguyen/equi-rl-for-pomdps", "project": "", "reviewers": "Pu8X;78Fy;e6B6;jfAZ", "site": "https://openreview.net/forum?id=AnDDMQgM7-", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;3;4", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6869063856189094683&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Northeastern University;Boston Dynamics Artificial Intelligence Institute;Boston Dynamics AI Institute", "aff_unique_dep": ";Artificial Intelligence;AI Institute", "aff_unique_url": "https://www.northeastern.edu;https://www.bostondynamics.com/;https://www.bostondynamics.com/", "aff_unique_abbr": "NEU;BD AI;BD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ApxLUk8U-l", "title": "Self-Improving Robots: End-to-End Autonomous Visuomotor Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "A practical and efficient real-world robot system that can self-improve by reinforcement learning.", "abstract": "In imitation and reinforcement learning (RL), the cost of human supervision limits the amount of data that the robots can be trained on. While RL offers a framework for building self-improving robots that can learn via trial-and-error autonomously, practical realizations end up requiring extensive human supervision for reward function design and repeated resetting of the environment between episodes of interactions. In this work, we propose MEDAL++, a novel design for self-improving robotic systems: given a small set of expert demonstrations at the start, the robot autonomously practices the task by learning to both do and undo the task, simultaneously inferring the reward function from the demonstrations. The policy and reward function are learned end-to-end from high-dimensional visual inputs, bypassing the need for explicit state estimation or task-specific pre-training for visual encoders used in prior work. We first evaluate our proposed system on a simulated non-episodic benchmark EARL, finding that MEDAL++ is both more data efficient and gets up to 30% better final performance compared to state-of-the-art vision-based methods. Our real-robot experiments show that MEDAL++ can be applied to manipulation problems in larger environments than those considered in prior work, and autonomous self-improvement can improve the success rate by 30% to 70% over behavioral cloning on just the expert data.", "keywords": "reinforcement learning;autonomous;reset-free;manipulation", "primary_area": "", "supplementary_material": "/attachment/2526dff7dcf90cb1c09bab833722188e491deb31.zip", "author": "Archit Sharma;Ahmed M Ahmed;Rehaan Ahmad;Chelsea Finn", "authorids": "~Archit_Sharma1;~Ahmed_M_Ahmed1;~Rehaan_Ahmad1;~Chelsea_Finn1", "gender": "M;M;M;F", "homepage": ";;;https://ai.stanford.edu/~cbfinn/", "dblp": "220/3163.html;;;131/1783", "google_scholar": "_0IIzxgAAAAJ;;-bA8eT4AAAAJ;vfPE6hgAAAAJ", "orcid": ";;;", "linkedin": ";ahmed-ahmed-13914510a/;;", "or_profile": "~Archit_Sharma1;~Ahmed_M_Ahmed1;~Rehaan_Ahmad1;~Chelsea_Finn1", "aff": "Stanford University;;Computer Science Department, Stanford University;Google", "aff_domain": "stanford.edu;;cs.stanford.edu;google.com", "position": "Graduate Student;;Undergrad student;Research Scientist", "bibtex": "@inproceedings{\nsharma2023selfimproving,\ntitle={Self-Improving Robots: End-to-End Autonomous Visuomotor Reinforcement Learning},\nauthor={Archit Sharma and Ahmed M Ahmed and Rehaan Ahmad and Chelsea Finn},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ApxLUk8U-l}\n}", "github": "https://github.com/rehaanahmad2013/self-improving-robots/", "project": "", "reviewers": "xqXG;uNtL;Y525", "site": "https://openreview.net/forum?id=ApxLUk8U-l", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14673943475073273759&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;Google", 
"aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "AyRr_i028w", "title": "Cold Diffusion on the Replay Buffer: Learning to Plan from Known Good States", "track": "main", "status": "Poster", "tldr": "By using cold diffusion over a Replay Buffer, we can learn to imitate while avoiding infeasible states.", "abstract": "Learning from demonstrations (LfD) has successfully trained robots to exhibit remarkable generalization capabilities. However, many powerful imitation techniques do not prioritize the feasibility of the robot behaviors they generate. In this work, we explore the feasibility of plans produced by LfD. As in prior work, we employ a temporal diffusion model with fixed start and goal states to facilitate imitation through in-painting. Unlike previous studies, we apply cold diffusion to ensure the optimization process is directed through the agent's replay buffer of previously visited states. This routing approach increases the likelihood that the final trajectories will predominantly occupy the feasible region of the robot's state space. We test this method in simulated robotic environments with obstacles and observe a significant improvement in the agent's ability to avoid these obstacles during planning.", "keywords": "Imitation Learning;Reinforcement Learning;Diffusion;Cold Diffusion;Planning;Safety", "primary_area": "", "supplementary_material": "", "author": "Zidan Wang;Takeru Oba;Takuma Yoneda;Rui Shen;Matthew Walter;Bradly C. Stadie", "authorids": "~Zidan_Wang1;~Takeru_Oba1;~Takuma_Yoneda1;~Rui_Shen1;~Matthew_Walter1;~Bradly_C._Stadie1", "gender": "F;M;M;M;M;", "homepage": "https://github.com/zidanwang2025;https://obat2343.wixsite.com/my-site;https://takuma.yoneda.xyz/;;http://ttic.edu/walter;", "dblp": ";282/8759;;;50/7734;166/1368", "google_scholar": ";Dwb-5UgAAAAJ;EtYv_AIAAAAJ;;RAiewnEAAAAJ;", "orcid": ";;;;0000-0003-1425-6050;", "linkedin": ";;;rui-shen-rs/;;", "or_profile": "~Zidan_Wang1;~Takeru_Oba1;~Takuma_Yoneda1;~Rui_Shen1;~Matthew_Walter1;~Bradly_C._Stadie1", "aff": "Northwestern University;Toyota Technological Institute;Toyota Technological Institute at Chicago;Yale University;Toyota Technological Institute at Chicago;Northwestern University", "aff_domain": "u.northwestern.edu;toyota-ti.ac.jp;ttic.edu;yale.edu;ttic.edu;northwestern.edu", "position": "PhD student;PhD student;PhD student;MS student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2023cold,\ntitle={Cold Diffusion on the Replay Buffer: Learning to Plan from Known Good States},\nauthor={Zidan Wang and Takeru Oba and Takuma Yoneda and Rui Shen and Matthew Walter and Bradly C. 
Stadie},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=AyRr_i028w}\n}", "github": "https://github.com/zidanwang2025/cold_diffusion_on_replay_buffer", "project": "", "reviewers": "Qj78;aADh;JCXJ;MbuE", "site": "https://openreview.net/forum?id=AyRr_i028w", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;4;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 1.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8619956806685677835&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;3;2;0", "aff_unique_norm": "Northwestern University;Toyota Technological Institute;Toyota Technological Institute at Chicago;Yale University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.northwestern.edu;https://www.tti.ac.jp;https://www.tti-chicago.org;https://www.yale.edu", "aff_unique_abbr": "NU;TTI;TTI Chicago;Yale", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Chicago", "aff_country_unique_index": "0;1;0;0;0;0", "aff_country_unique": "United States;Japan" }, { "id": "B7PnAw4ze0l", "title": "Precise Robotic Needle-Threading with Tactile Perception and Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "A tactile-perception-based solution to robotic needle threading task", "abstract": "This work presents a novel tactile perception-based method, named T-NT, for performing the needle-threading task, an application of deformable linear object (DLO) manipulation. This task is divided into two main stages: \\textit{Tail-end Finding} and \\textit{Tail-end Insertion}. In the first stage, the agent traces the contour of the thread twice using vision-based tactile sensors mounted on the gripper fingers. The two-run tracing is to locate the tail-end of the thread.\n In the second stage, it employs a tactile-guided reinforcement learning (RL) model to drive the robot to insert the thread into the target needle eyelet. The RL model is trained in a Unity-based simulated environment. The simulation environment supports tactile rendering which can produce realistic tactile images and thread modeling. During insertion, the position of the poke point and the center of the eyelet are obtained through a pre-trained segmentation model, Grounded-SAM, which predicts the masks for both the needle eye and thread imprints. These positions are then fed into the reinforcement learning model, aiding in a smoother transition to real-world applications. Extensive experiments on real robots are conducted to demonstrate the efficacy of our method. 
More experiments and videos can be found in the supplementary materials and on the website: \\url{https://sites.google.com/view/tac-needlethreading}.", "keywords": "tactile perception;needle threading", "primary_area": "", "supplementary_material": "/attachment/4e1fe239501887e26bc421b4f63f411a5a5eb8e6.zip", "author": "Zhenjun Yu;Wenqiang Xu;Siqiong Yao;Jieji Ren;Tutian Tang;Yutong Li;Guoying Gu;Cewu Lu", "authorids": "~Zhenjun_Yu1;~Wenqiang_Xu2;~Siqiong_Yao1;~Jieji_Ren1;~Tutian_Tang1;~Yutong_Li3;~Guoying_Gu1;~Cewu_Lu3", "gender": "M;M;F;M;M;M;;M", "homepage": ";;;;https://github.com/ElectronicElephant;https://davidliyutong.github.io;http://softrobotics.sjtu.edu.cn/;https://www.mvig.org/", "dblp": ";;;278/8428;;;;", "google_scholar": ";PdzO-4YAAAAJ;;;;KSv942gAAAAJ;ac8jed8AAAAJ;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ", "orcid": ";0000-0002-8648-5576;0000-0001-6968-1586;0000-0001-6381-6830;;0009-0002-6959-6159;;", "linkedin": "https://www.linkedin.cn/incareer/in/ACoAADkd-OQBIZghy47hQafZehw_kvmRaWC-RvY;;;;;;;", "or_profile": "~Zhenjun_Yu1;~Wenqiang_Xu2;~Siqiong_Yao1;~Jieji_Ren1;~Tutian_Tang1;~Yutong_Li3;~Guoying_Gu1;~Cewu_Lu3", "aff": "Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University;Shanghai Jiaotong University", "aff_domain": "sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;sjtu.edu.cn", "position": "Undergrad student;PhD student;Assistant Professor;Postdoc;PhD student;MS student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyu2023precise,\ntitle={Precise Robotic Needle-Threading with Tactile Perception and Reinforcement Learning},\nauthor={Zhenjun Yu and Wenqiang Xu and Siqiong Yao and Jieji Ren and Tutian Tang and Yutong Li and Guoying Gu and Cewu Lu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=B7PnAw4ze0l}\n}", "github": "", "project": "", "reviewers": "zrs7;t9QJ;uQnm;KsbS", "site": "https://openreview.net/forum?id=B7PnAw4ze0l", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;3;4;5", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 15, "authors#_avg": 8, "corr_rating_confidence": 0.20751433915982243, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6359310541336816869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Shanghai Jiao Tong University", "aff_unique_dep": "", "aff_unique_url": "https://www.sjtu.edu.cn", "aff_unique_abbr": "SJTU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "BimpCf1rT7", "title": "Compositional Diffusion-Based Continuous Constraint Solvers", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper introduces an approach for learning to solve continuous constraint satisfaction problems (CCSP) in robotic reasoning and planning. Previous methods primarily rely on hand-engineering or learning generators for specific constraint types and then rejecting the value assignments when other constraints are violated. 
By contrast, our model, the compositional diffusion continuous constraint solver (Diffusion-CCSP) derives global solutions to CCSPs by representing them as factor graphs and combining the energies of diffusion models trained to sample for individual constraint types. Diffusion-CCSP exhibits strong generalization to novel combinations of known constraints, and it can be integrated into a task and motion planner to devise long-horizon plans that include actions with both discrete and continuous parameters.", "keywords": "Diffusion Models;Constraint Satisfaction Problems;Task and Motion Planning", "primary_area": "", "supplementary_material": "/attachment/620b97fb345b93aa110b0afc0892ed87edb555de.zip", "author": "Zhutian Yang;Jiayuan Mao;Yilun Du;Jiajun Wu;Joshua B. Tenenbaum;Tom\u00e1s Lozano-P\u00e9rez;Leslie Pack Kaelbling", "authorids": "~Zhutian_Yang1;~Jiayuan_Mao1;~Yilun_Du1;~Jiajun_Wu1;~Joshua_B._Tenenbaum1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "gender": "F;F;;M;;M;F", "homepage": "https://zt-yang.com;http://jiayuanm.com;https://yilundu.github.io;https://jiajunwu.com;;http://people.csail.mit.edu/tlp/;http://people.csail.mit.edu/lpk/", "dblp": ";200/8283;204/4379;117/4768;t/JoshuaBTenenbaum;90/752;k/LesliePackKaelbling", "google_scholar": "vW5LLmUAAAAJ;-xaOIZIAAAAJ;;2efgcS0AAAAJ;;gQOKAggAAAAJ;IcasIiwAAAAJ", "orcid": ";0000-0003-4798-3748;;0000-0002-4176-343X;;;0000-0001-6054-7145", "linkedin": "zhutian-yang/;;;jiajunwu/;;;", "or_profile": "~Zhutian_Yang1;~Jiayuan_Mao1;~Yilun_Du1;~Jiajun_Wu1;~Joshua_B._Tenenbaum1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Stanford University;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;stanford.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor;Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2023compositional,\ntitle={Compositional Diffusion-Based Continuous Constraint Solvers},\nauthor={Zhutian Yang and Jiayuan Mao and Yilun Du and Jiajun Wu and Joshua B. 
Tenenbaum and Tom{\\'a}s Lozano-P{\\'e}rez and Leslie Pack Kaelbling},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=BimpCf1rT7}\n}", "github": "", "project": "", "reviewers": "drxW;wq1w;88cB", "site": "https://openreview.net/forum?id=BimpCf1rT7", "pdf_size": 0, "rating": "6;10;10", "confidence": "2;4;5", "rating_avg": 8.666666666666666, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 7, "corr_rating_confidence": 0.9449111825230679, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7167103910856407796&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;https://www.stanford.edu", "aff_unique_abbr": "MIT;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "BzjLaVvr955", "title": "Topology-Matching Normalizing Flows for Out-of-Distribution Detection in Robot Learning", "track": "main", "status": "Poster", "tldr": "We propose to equip NFs with efficient but flexible base distributions to overcome the topological constraint for OOD detection in robot learning.", "abstract": "To facilitate reliable deployments of autonomous robots in the real world, Out-of-Distribution (OOD) detection capabilities are often required. A powerful approach for OOD detection is based on density estimation with Normalizing Flows (NFs). However, we find that prior work with NFs attempts to match the complex target distribution topologically with na\u00efve base distributions leading to adverse implications. In this work, we circumvent this topological mismatch using an expressive class-conditional base distribution trained with an information-theoretic objective to match the required topology. The proposed method enjoys the merits of wide compatibility with existing learned models without any performance degradation and minimum computation overhead while enhancing OOD detection capabilities. We demonstrate superior results in density estimation and 2D object detection benchmarks in comparison with extensive baselines. 
Moreover, we showcase the applicability of the method with a real-robot deployment.", "keywords": "Normalizing Flows;Out-of-Distribution Detection;Robotic Introspection", "primary_area": "", "supplementary_material": "/attachment/25e26bfae5b36b8fdc9f99228f0373ad7cd18806.zip", "author": "Jianxiang Feng;Jongseok Lee;Simon Geisler;Stephan G\u00fcnnemann;Rudolph Triebel", "authorids": "~Jianxiang_Feng1;~Jongseok_Lee1;~Simon_Geisler1;~Stephan_G\u00fcnnemann1;~Rudolph_Triebel1", "gender": "M;M;M;M;M", "homepage": ";https://rmc.dlr.de/rm/en/staff/jongseok.lee/;http://www.daml.in.tum.de;https://rmc.dlr.de/rm/de/staff/rudolph.triebel/;https://www.in.tum.de/en/daml/team/simon-geisler/", "dblp": "267/9411;58/4966;43/3011;85/443;237/0253", "google_scholar": "b-5CscIAAAAJ;3GaQJP8AAAAJ;;SuOUxjUAAAAJ;00x9jJwAAAAJ", "orcid": ";;;0000-0002-7975-036X;0000-0003-0867-1856", "linkedin": ";jongseok-lee-b75362118;;;simon-geisler-ai/", "or_profile": "~Jianxiang_Feng1;~Jongseok_Lee1;~Stephan_G\u00fcnnemann1;~Rudolph_A_Triebel1;~Simon_Markus_Geisler1", "aff": "RMC, German Aerospace Center (DLR);German Aerospace Center (DLR);Technical University Munich;Technical University Munich;Technical University Munich", "aff_domain": "dlr.de;dlr.de;tum.de;tum.de;tum.de", "position": "Researcher;Researcher;Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nfeng2023topologymatching,\ntitle={Topology-Matching Normalizing Flows for Out-of-Distribution Detection in Robot Learning},\nauthor={Jianxiang Feng and Jongseok Lee and Simon Geisler and Stephan G{\\\"u}nnemann and Rudolph Triebel},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=BzjLaVvr955}\n}", "github": "https://github.com/DLR-RM", "project": "", "reviewers": "MswT;hs3b;q52p;X7BF", "site": "https://openreview.net/forum?id=BzjLaVvr955", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;3", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 9, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15039042558471916340&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;1;1", "aff_unique_norm": "German Aerospace Center;Technical University of Munich", "aff_unique_dep": "RMC;", "aff_unique_url": "https://www.dlr.de;https://www.tum.de", "aff_unique_abbr": "DLR;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany" }, { "id": "C5MQUlzhVjQ", "title": "Surrogate Assisted Generation of Human-Robot Interaction Scenarios", "track": "main", "status": "Oral", "tldr": "We present an efficient method of automatically generating challenging and diverse human-robot interaction scenarios.", "abstract": "As human-robot interaction (HRI) systems advance, so does the difficulty of evaluating and understanding the strengths and limitations of these systems in different environments and with different users. To this end, previous methods have algorithmically generated diverse scenarios that reveal system failures in a shared control teleoperation task. However, these methods require directly evaluating generated scenarios by simulating robot policies and human actions. The computational cost of these evaluations limits their applicability in more complex domains. Thus, we propose augmenting scenario generation systems with surrogate models that predict both human and robot behaviors. 
In the shared control teleoperation domain and a more complex shared workspace collaboration task, we show that surrogate assisted scenario generation efficiently synthesizes diverse datasets of challenging scenarios. We demonstrate that these failures are reproducible in real-world interactions.", "keywords": "Scenario Generation;Human-Robot Interaction;Quality Diversity", "primary_area": "", "supplementary_material": "/attachment/8792239ae5c278274bb15547a9dc3e2e03f97240.zip", "author": "Varun Bhatt;Heramb Nemlekar;Matthew Christopher Fontaine;Bryon Tjanaka;Hejia Zhang;Ya-Chuan Hsu;Stefanos Nikolaidis", "authorids": "~Varun_Bhatt1;~Heramb_Nemlekar1;~Matthew_Christopher_Fontaine1;~Bryon_Tjanaka1;~Hejia_Zhang1;~Ya-Chuan_Hsu1;~Stefanos_Nikolaidis1", "gender": ";;M;Not Specified;M;F;", "homepage": ";https://herambnemlekar.github.io/;;https://btjanaka.net;https://www.hejiazhang.me;https://scholar.google.com/citations?user=40WQ9NwAAAAJ&hl=en;http://stefanosnikolaidis.net/", "dblp": "226/9861;;239/8516;277/1380;172/9965;;62/6555", "google_scholar": "OgAUSRMAAAAJ;;RqSvzikAAAAJ;851Y-O8AAAAJ;h_0iAx4AAAAJ;;", "orcid": ";;;0000-0002-9602-5039;;;", "linkedin": "varun-bhatt-049a49168/;;;btjanaka/;hejia-zhang-3914bb154/;;", "or_profile": "~Varun_Bhatt1;~Heramb_Nemlekar1;~Matthew_Christopher_Fontaine1;~Bryon_Tjanaka1;~Hejia_Zhang1;~Ya-Chuan_Hsu1;~Stefanos_Nikolaidis1", "aff": "University of Southern California;University of Southern California;University of Southern California;InstaDeep;University of Southern California;University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu;usc.edu;instadeep.com;usc.edu;usc.edu;usc.edu", "position": "PhD student;PhD student;PhD student;Intern;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nbhatt2023surrogate,\ntitle={Surrogate Assisted Generation of Human-Robot Interaction Scenarios},\nauthor={Varun Bhatt and Heramb Nemlekar and Matthew Christopher Fontaine and Bryon Tjanaka and Hejia Zhang and Ya-Chuan Hsu and Stefanos Nikolaidis},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=C5MQUlzhVjQ}\n}", "github": "https://github.com/icaros-usc/dsas", "project": "", "reviewers": "rtcn;ZDjU;EBp2;3aQ9", "site": "https://openreview.net/forum?id=C5MQUlzhVjQ", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "2;3;4;3", "rating_avg": 8.0, "confidence_avg": 3.0, "replies_avg": 14, "authors#_avg": 7, "corr_rating_confidence": 0.7071067811865475, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16150408863829381069&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "University of Southern California;InstaDeep", "aff_unique_dep": ";", "aff_unique_url": "https://www.usc.edu;https://www.instadeep.com", "aff_unique_abbr": "USC;InstaDeep", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "CnKf9TyYtf2", "title": "Few-Shot In-Context Imitation Learning via Implicit Graph Alignment", "track": "main", "status": "Poster", "tldr": "We learn how to align graph representations of objects and use it as a foundation of a few-shot in-context imitation learning framework.", "abstract": "Consider the following problem: given a few demonstrations of a task across a few different objects, how can a robot learn to perform that same task on new, 
previously unseen objects? This is challenging because the large variety of objects within a class makes it difficult to infer the task-relevant relationship between the new objects and the objects in the demonstrations. We address this by formulating imitation learning as a conditional alignment problem between graph representations of objects. Consequently, we show that this conditioning allows for in-context learning, where a robot can perform a task on a set of new objects immediately after the demonstrations, without any prior knowledge about the object class or any further training. In our experiments, we explore and validate our design choices, and we show that our method is highly effective for few-shot learning of several real-world, everyday tasks, whilst outperforming baselines. Videos are available on our project webpage at https://www.robot-learning.uk/implicit-graph-alignment.", "keywords": "Few-Shot Imitation Learning;Graph Neural Networks", "primary_area": "", "supplementary_material": "/attachment/137e90875c6c79fc25006093547edc3dac67bc3e.zip", "author": "Vitalis Vosylius;Edward Johns", "authorids": "~Vitalis_Vosylius1;~Edward_Johns1", "gender": "M;M", "homepage": "https://www.google.com/;https://www.robot-learning.uk", "dblp": "272/8659;68/9968", "google_scholar": "nktafp8AAAAJ;https://scholar.google.co.uk/citations?user=sMIUkiQAAAAJ", "orcid": ";0000-0002-8914-8786", "linkedin": ";https://uk.linkedin.com/in/edward-johns-1b24845a", "or_profile": "~Vitalis_Vosylius1;~Edward_Johns1", "aff": "Imperial College London;Imperial College London", "aff_domain": "imperial.ac.uk;imperial.ac.uk", "position": "PhD student;Associate Professor", "bibtex": "@inproceedings{\nvosylius2023fewshot,\ntitle={Few-Shot In-Context Imitation Learning via Implicit Graph Alignment},\nauthor={Vitalis Vosylius and Edward Johns},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=CnKf9TyYtf2}\n}", "github": "", "project": "", "reviewers": "UYjn;R4fh;4Z1U;r1Xw", "site": "https://openreview.net/forum?id=CnKf9TyYtf2", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "2;4;4;4", "rating_avg": 8.0, "confidence_avg": 3.5, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16159419080032316194&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "D0X97ODIYK", "title": "Structural Concept Learning via Graph Attention for Multi-Level Rearrangement Planning", "track": "main", "status": "Poster", "tldr": "We present SCL: A scalable graph attention network based approach for object rearrangement planning to build complex, multi-level structures.", "abstract": "Robotic manipulation tasks, such as object rearrangement, play a crucial role in enabling robots to interact with complex and arbitrary environments. Existing work focuses primarily on single-level rearrangement planning and, even if multiple levels exist, dependency relations among substructures are geometrically simpler, like tower stacking. 
We propose Structural Concept Learning (SCL), a deep learning approach that leverages graph attention networks to perform multi-level object rearrangement planning for scenes with structural dependency hierarchies. It is trained on a self-generated simulation data set with intuitive structures, works for unseen scenes with an arbitrary number of objects and higher complexity of structures, infers independent substructures to allow for task parallelization over multiple manipulators, and generalizes to the real world. We compare our method with a range of classical and model-based baselines to show that our method leverages its scene understanding to achieve better performance, flexibility, and efficiency. The dataset, demonstration videos, supplementary details, and code implementation are available at: https://manavkulshrestha.github.io/scl", "keywords": "Rearrangement Planning;Robot Manipulation;Graph Attention", "primary_area": "", "supplementary_material": "/attachment/c4f7cde3897f65d231b7631d18588851fe03b893.zip", "author": "Manav Kulshrestha;Ahmed H Qureshi", "authorids": "~Manav_Kulshrestha1;~Ahmed_H_Qureshi1", "gender": "M;M", "homepage": ";https://qureshiahmed.github.io/", "dblp": ";222/2796", "google_scholar": ";https://scholar.google.com/citations?hl=en", "orcid": ";", "linkedin": "manav-kulshrestha;", "or_profile": "~Manav_Kulshrestha1;~Ahmed_Qureshi1", "aff": "Purdue University;Purdue University", "aff_domain": "purdue.edu;purdue.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkulshrestha2023structural,\ntitle={Structural Concept Learning via Graph Attention for Multi-Level Rearrangement Planning},\nauthor={Manav Kulshrestha and Ahmed H Qureshi},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=D0X97ODIYK}\n}", "github": "", "project": "", "reviewers": "hfsk;mpK3;fLRU;Ej1j", "site": "https://openreview.net/forum?id=D0X97ODIYK", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;3;5;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1185823787354948367&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Purdue University", "aff_unique_dep": "", "aff_unique_url": "https://www.purdue.edu", "aff_unique_abbr": "Purdue", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "DYPOvNot5F", "title": "Diff-LfD: Contact-aware Model-based Learning from Visual Demonstration for Robotic Manipulation via Differentiable Physics-based Simulation and Rendering", "track": "main", "status": "Oral", "tldr": "", "abstract": "Learning from Demonstration (LfD) is an efficient technique for robots to acquire new skills through expert observation, significantly mitigating the need for laborious manual reward function design. This paper introduces a novel framework for model-based LfD in the context of robotic manipulation. Our proposed pipeline is underpinned by two primary components: self-supervised pose and shape estimation and contact sequence generation. The former utilizes differentiable rendering to estimate object poses and shapes from demonstration videos, while the latter iteratively optimizes contact points and forces using differentiable simulation, consequently effectuating object transformations. 
Empirical evidence demonstrates the efficacy of our LfD pipeline in acquiring manipulation actions from human demonstrations. Complementary to this, ablation studies focusing on object tracking and contact sequence inference underscore the robustness and efficiency of our approach in generating long-horizon manipulation actions, even amidst environmental noise. Validation of our results extends to real-world deployment of the proposed pipeline. Supplementary materials and videos are available on our webpage.", "keywords": "Learning from Demonstration;model-based robot learning;differentiable physics-based simulation and rendering", "primary_area": "", "supplementary_material": "/attachment/fedc01071ce91693a346dd4f598c517ba1a5af4f.zip", "author": "Xinghao Zhu;Jinghan Ke;Zhixuan Xu;Zhixin Sun;Bizhe Bai;Jun Lv;Qingtao Liu;Yuwei Zeng;Qi Ye;Cewu Lu;Masayoshi Tomizuka;Lin Shao", "authorids": "~Xinghao_Zhu1;~Jinghan_Ke2;~Zhixuan_Xu1;~Zhixin_Sun1;~Bizhe_Bai1;~Jun_Lv2;~Qingtao_Liu2;~Yuwei_Zeng1;~Qi_Ye2;~Cewu_Lu3;~Masayoshi_Tomizuka2;~Lin_Shao2", "gender": ";;M;M;M;M;M;;;M;;M", "homepage": ";;https://ariszxxu.github.io;;https://baibizhe.github.io;https://lyuj1998.github.io/;https://lqts.github.io/;https://friolero.github.io/;https://sites.google.com/site/qiyeincv/;https://www.mvig.org/;;https://linsats.github.io/", "dblp": ";;;;323/6206;;;;;;;26/8546-2", "google_scholar": ";;OL_axPMAAAAJ;eZWZVNEAAAAJ;PVSMSqQAAAAJ;DtaiAjwAAAAJ;;PqvAzW4AAAAJ;4D-HZ98AAAAJ;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;;0000-0003-2285-3402;;;", "linkedin": ";;;zhixin-sun-867335292/;;;;;;;;", "or_profile": "~Xinghao_Zhu1;~Jinghan_Ke2;~Zhixuan_Xu1;~Zhixin_Sun1;~Bizhe_Bai1;~Jun_Lv2;~Qingtao_Liu2;~Yuwei_Zeng1;~Qi_Ye2;~Cewu_Lu3;~Masayoshi_Tomizuka2;~Lin_Shao2", "aff": ";;Zhejiang University;Nanjing University;University of Queensland;Shanghai Jiaotong University;Zhejiang University;National University of Singapore;Zhejiang University;Shanghai Jiaotong University;;National University of Singapore", "aff_domain": ";;zju.edu.cn;nju.edu.cn;uq.edu.au;sjtu.edu.cn;zju.edu.cn;comp.nus.edu.sg;zju.edu.cn;sjtu.edu.cn;;nus.edu.sg", "position": ";;Undergrad student;Undergrad student;MS student;PhD student;PhD student;PhD student;Assistant Professor;Full Professor;;Assistant Professor", "bibtex": "@inproceedings{\nzhu2023difflfd,\ntitle={Diff-LfD: Contact-aware Model-based Learning from Visual Demonstration for Robotic Manipulation via Differentiable Physics-based Simulation and Rendering},\nauthor={Xinghao Zhu and Jinghan Ke and Zhixuan Xu and Zhixin Sun and Bizhe Bai and Jun Lv and Qingtao Liu and Yuwei Zeng and Qi Ye and Cewu Lu and Masayoshi Tomizuka and Lin Shao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=DYPOvNot5F}\n}", "github": "", "project": "", "reviewers": "jZUA;YF65;Qw95;kobW", "site": "https://openreview.net/forum?id=DYPOvNot5F", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;3;4;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 15, "authors#_avg": 12, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11306151065581134841&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;3;0;4;0;3;4", "aff_unique_norm": "Zhejiang University;Nanjing University;University of Queensland;Shanghai Jiao Tong University;National University of Singapore", "aff_unique_dep": ";;;;", 
"aff_unique_url": "https://www.zju.edu.cn;https://www.nju.edu.cn;https://www.uq.edu.au;https://www.sjtu.edu.cn;https://www.nus.edu.sg", "aff_unique_abbr": "ZJU;Nanjing U;UQ;SJTU;NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;2;0;0;2", "aff_country_unique": "China;Australia;Singapore" }, { "id": "E2vL12SwO1", "title": "PreCo: Enhancing Generalization in Co-Design of Modular Soft Robots via Brain-Body Pre-Training", "track": "main", "status": "Oral", "tldr": "Enhancing Generalization in Co-Design of Modular Soft Robots via Brain-Body Pre-Training", "abstract": "Brain-body co-design, which involves the collaborative design of control strategies and morphologies, has emerged as a promising approach to enhance a robot's adaptability to its environment. However, the conventional co-design process often starts from scratch, lacking the utilization of prior knowledge. This can result in time-consuming and costly endeavors. In this paper, we present PreCo, a novel methodology that efficiently integrates brain-body pre-training into the co-design process of modular soft robots. PreCo is based on the insight of embedding co-design principles into models, achieved by pre-training a universal co-design policy on a diverse set of tasks. This pre-trained co-designer is utilized to generate initial designs and control policies, which are then fine-tuned for specific co-design tasks. Through experiments on a modular soft robot system, our method demonstrates zero-shot generalization to unseen co-design tasks, facilitating few-shot adaptation while significantly reducing the number of policy iterations required.", "keywords": "Robot Co-design;Pre-training;Reinforcement Learning;Modular Soft Robots", "primary_area": "", "supplementary_material": "/attachment/f051996905d809b9ce0330a2262489e0f2445699.zip", "author": "Yuxing Wang;Shuang Wu;Tiantian Zhang;Yongzhe Chang;Haobo Fu;QIANG FU;Xueqian Wang", "authorids": "~Yuxing_Wang3;~Shuang_Wu3;~Tiantian_Zhang2;~Yongzhe_Chang1;~Haobo_Fu2;~QIANG_FU8;~Xueqian_Wang1", "gender": "M;F;M;M;M;M;M", "homepage": "https://yuxing-wang-thu.github.io/;;;;;;", "dblp": "82/10142;92/6866-2.html;;85/8571;;43/3563-1;85/3231", "google_scholar": "ac-6jfMAAAAJ;https://scholar.google.com.hk/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=7iRQyFcAAAAJ;LFdJXNcAAAAJ;gANaxT0AAAAJ;h9dN_ykAAAAJ;https://scholar.google.com.hk/citations?user=_MtBmxkAAAAJ", "orcid": ";0000-0002-0204-4758;0000-0002-9083-5348;;;0000-0003-3542-0593;0000-0003-2772-4511", "linkedin": ";;;haobo-fu-382b0784/;;;", "or_profile": "~Yuxing_Wang3;~Tiantian_Zhang2;~Yongzhe_Chang1;~Haobo_Fu2;~QIANG_FU8;~Xueqian_Wang1;~shuang_wu2", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tencent AI Lab;Tencent AI Lab;Tsinghua University;Tencent AI Lab", "aff_domain": "mails.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;tencent.com;tencent.com;tsinghua.edu.cn;tencent.com", "position": "MS student;PhD student;Researcher;Principal Researcher;Principal Researcher;Full Professor;Researcher", "bibtex": "@inproceedings{\nwang2023preco,\ntitle={PreCo: Enhancing Generalization in Co-Design of Modular Soft Robots via Brain-Body Pre-Training},\nauthor={Yuxing Wang and Shuang Wu and Tiantian Zhang and Yongzhe Chang and Haobo Fu and QIANG FU and Xueqian Wang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=E2vL12SwO1}\n}", "github": "", "project": "", "reviewers": "Bcvj;dKrr;rtsy;WFDJ", "site": 
"https://openreview.net/forum?id=E2vL12SwO1", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;5;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2578876904830030627&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1;1;0;1", "aff_unique_norm": "Tsinghua University;Tencent", "aff_unique_dep": ";Tencent AI Lab", "aff_unique_url": "https://www.tsinghua.edu.cn;https://ai.tencent.com", "aff_unique_abbr": "THU;Tencent AI Lab", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "ES_TOp4YJeD", "title": "ADU-Depth: Attention-based Distillation with Uncertainty Modeling for Depth Estimation", "track": "main", "status": "Poster", "tldr": "The top performer for monocular depth estimation to date is based on the proposed novel distillation scheme and ranked 1st on the challenging KITTI online benchmark.", "abstract": "Monocular depth estimation is challenging due to its inherent ambiguity and ill-posed nature, yet it is quite important to many applications. While recent works achieve limited accuracy by designing increasingly complicated networks to extract features with limited spatial geometric cues from a single RGB image, we intend to introduce spatial cues by training a teacher network that leverages left-right image pairs as inputs and transferring the learned 3D geometry-aware knowledge to the monocular student network. Specifically, we present a novel knowledge distillation framework, named ADU-Depth, with the goal of leveraging the well-trained teacher network to guide the learning of the student network, thus boosting the precise depth estimation with the help of extra spatial scene information. To enable domain adaptation and ensure effective and smooth knowledge transfer from teacher to student, we apply both attention-adapted feature distillation and focal-depth-adapted response distillation in the training stage. In addition, we explicitly model the uncertainty of depth estimation to guide distillation in both feature space and result space to better produce 3D-aware knowledge from monocular observations and thus enhance the learning for hard-to-predict image regions. 
Our extensive experiments on the real depth estimation datasets KITTI and DrivingStereo demonstrate the effectiveness of the proposed method, which ranked 1st on the challenging KITTI online benchmark.", "keywords": "Monocular depth estimation;Camera perception;Distillation", "primary_area": "", "supplementary_material": "/attachment/981e014da83d15961a278eb15d1bd511a2a9194c.zip", "author": "ZiZhang Wu;Zhuozheng Li;Zhi-Gang Fan;Yunzhe Wu;Xiaoquan Wang;Rui Tang;Jian Pu", "authorids": "~ZiZhang_Wu1;~Zhuozheng_Li1;~Zhi-Gang_Fan3;~Yunzhe_Wu1;~Xiaoquan_Wang2;rui.tang@zongmutech.com;~Jian_Pu1", "gender": "M;M;;M;M;;M", "homepage": ";https://scholar.google.com/citations?hl=zh-TW&user=DflbzX0AAAAJ;;https://blog.csdn.net/Nelson0000?spm=1000.2115.3001.5343;https://www.linkedin.com/in/xiaoquan-wang-780799118/;;", "dblp": "57/11182.html;64/8135;;334/7691;221/3148;;43/6295", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;https://scholar.google.com/citations?hl=zh-TW;;0qUPBwsAAAAJ;;;9pUCoOkAAAAJ", "orcid": "0000-0002-2169-8271;0000-0002-3024-4095;;;;;", "linkedin": ";;;;xiaoquan-wang-780799118/;;", "or_profile": "~ZiZhang_Wu1;~Zhuozheng_Li1;~Zhi-Gang_Fan3;~Yunzhe_Wu1;~Xiaoquan_Wang2;rui.tang@zongmutech.com;~Jian_Pu1", "aff": "Fudan University;ZongMu Technology Co.,Ltd.;;ZongmuTech;Zongmutech;;Fudan University", "aff_domain": "fudan.edu.cn;zongmutech.com;;zongmutech.ac.uk;zongmutech.com;;fudan.edu.cn", "position": "PhD student;Researcher;;Researcher;Researcher;;Associate Professor", "bibtex": "@inproceedings{\nwu2023adudepth,\ntitle={{ADU}-Depth: Attention-based Distillation with Uncertainty Modeling for Depth Estimation},\nauthor={ZiZhang Wu and Zhuozheng Li and Zhi-Gang Fan and Yunzhe Wu and Xiaoquan Wang and Rui Tang and Jian Pu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ES_TOp4YJeD}\n}", "github": "", "project": "", "reviewers": "v8s2;nhhw;d9wF", "site": "https://openreview.net/forum?id=ES_TOp4YJeD", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=569962559357252873&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;2;0", "aff_unique_norm": "Fudan University;Zongmu Technology;ZongmuTech", "aff_unique_dep": ";;", "aff_unique_url": "https://www.fudan.edu.cn;;https://www.zongmutech.com", "aff_unique_abbr": "Fudan;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China;" }, { "id": "EXQ0eXtX3OW", "title": "Dexterity from Touch: Self-Supervised Pre-Training of Tactile Representations with Robotic Play", "track": "main", "status": "Poster", "tldr": "Tactile pretraining from play data for dexterous manipulation.", "abstract": "Teaching dexterity to multi-fingered robots has been a longstanding challenge in robotics. Most prominent work in this area focuses on learning controllers or policies that either operate on visual observations or state estimates derived from vision. However, such methods perform poorly on fine-grained manipulation tasks that require reasoning about contact forces or about objects occluded by the hand itself. In this work, we present T-Dex, a new approach for tactile-based dexterity, that operates in two phases. 
In the first phase, we collect 2.5 hours of play data, which is used to train self-supervised tactile encoders. This is necessary to bring high-dimensional tactile readings to a lower-dimensional embedding. In the second phase, given a handful of demonstrations for a dexterous task, we learn non-parametric policies that combine the tactile observations with visual ones. Across five challenging dexterous tasks, we show that our tactile-based dexterity models outperform purely vision and torque-based models by an average of 1.7X.\nFinally, we provide a detailed analysis on factors critical to T-Dex including the importance of play data, architectures, and representation learning.", "keywords": "Tactile;Dexterity;Manipulation", "primary_area": "", "supplementary_material": "/attachment/2945c9a73f3b5794a0e7edb3890b027355abf0fd.zip", "author": "Irmak Guzey;Ben Evans;Soumith Chintala;Lerrel Pinto", "authorids": "~Irmak_Guzey1;~Ben_Evans1;~Soumith_Chintala1;~Lerrel_Pinto1", "gender": "F;;M;M", "homepage": "https://irmakguzey.github.io/;;https://www.lerrelpinto.com/;https://bennevans.github.io/", "dblp": ";http://dblp.uni-trier.de/pers/hd/c/Chintala:Soumith;168/8304;87/9175", "google_scholar": "0FEl834AAAAJ;36ofBJgAAAAJ;pmVPj94AAAAJ;JPQom2sAAAAJ", "orcid": ";;;", "linkedin": ";;;bnevans/", "or_profile": "~Irmak_Guzey1;~Soumith_Chintala1;~Lerrel_Pinto1;~Benjamin_Evans1", "aff": "New York University;Meta Facebook;New York University;New York University", "aff_domain": "nyu.edu;fb.com;cs.nyu.edu;nyu.edu", "position": "MS student;Researcher;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nguzey2023dexterity,\ntitle={Dexterity from Touch: Self-Supervised Pre-Training of Tactile Representations with Robotic Play},\nauthor={Irmak Guzey and Ben Evans and Soumith Chintala and Lerrel Pinto},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=EXQ0eXtX3OW}\n}", "github": "https://github.com/irmakguzey/tactile-dexterity", "project": "", "reviewers": "x3zE;rRsP;27e9;Tv7A", "site": "https://openreview.net/forum?id=EXQ0eXtX3OW", "pdf_size": 0, "rating": "1;6;10;10", "confidence": "4;4;5;5", "rating_avg": 6.75, "confidence_avg": 4.5, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.8784585919193317, "gs_citation": 64, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=147780047725521829&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "New York University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.nyu.edu;https://meta.com", "aff_unique_abbr": "NYU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "EvuAJ0wD98", "title": "Intent-Aware Planning in Heterogeneous Traffic via Distributed Multi-Agent Reinforcement Learning", "track": "main", "status": "Oral", "tldr": "We combine opponent modeling, classical trajectory forecasting, and distributed MARL and show good performance both in avg. episodic reward and success rate on the (heterogeneous versions of) Highway-env and non-cooperative MPE.", "abstract": "Navigating safely and efficiently in dense and heterogeneous traffic scenarios is challenging for autonomous vehicles (AVs) due to their inability to infer the behaviors or intentions of nearby drivers. 
In this work, we introduce a distributed multi-agent reinforcement learning (MARL) algorithm for joint trajectory and intent prediction for autonomous vehicles in dense and heterogeneous environments. Our approach for intent-aware planning, iPLAN, allows agents to infer nearby drivers' intents solely from their local observations. We model an explicit representation of agents' private incentives: Behavioral Incentive for high-level decision-making strategy that sets planning sub-goals and Instant Incentive for low-level motion planning to execute sub-goals. Our approach enables agents to infer their opponents' behavior incentives and integrate this inferred information into their decision-making and motion-planning processes. We perform experiments on two simulation environments, Non-Cooperative Navigation and Heterogeneous Highway. In Heterogeneous Highway, results show that, compared with centralized training decentralized execution (CTDE) MARL baselines such as QMIX and MAPPO, our method yields a $4.3\\%$ and $38.4\\%$ higher episodic reward in mild and chaotic traffic, with $48.1\\%$ higher success rate and $80.6\\%$ longer survival time in chaotic traffic. We also compare with a decentralized training decentralized execution (DTDE) baseline IPPO and demonstrate a higher episodic reward of $12.7\\%$ and $6.3\\%$ in mild traffic and chaotic traffic, $25.3\\%$ higher success rate, and $13.7\\%$ longer survival time.", "keywords": "Autonomous Driving;Multi-agent Reinforcement Learning;Representation Learning", "primary_area": "", "supplementary_material": "/attachment/d49e3fbe023bed6a61172725d1adb8c729fd319a.zip", "author": "Xiyang Wu;Rohan Chandra;Tianrui Guan;Amrit Bedi;Dinesh Manocha", "authorids": "~Xiyang_Wu1;~Rohan_Chandra1;~Tianrui_Guan1;~Amrit_Bedi1;~Dinesh_Manocha3", "gender": "M;M;M;M;M", "homepage": "https://wuxiyang1996.github.io/;http://rohanchandra30.github.io/;https://rayguan97.github.io/;https://sites.google.com/view/amritsinghbedi/home;https://www.cs.umd.edu/people/dmanocha", "dblp": "277/9448;210/2278;255/5204;176/2707.html;m/DineshManocha", "google_scholar": "sI05dqQAAAAJ;uOIgTt8AAAAJ;_7mX21UAAAAJ;91WLA6QAAAAJ;X08l_4IAAAAJ", "orcid": "0000-0001-8538-8267;0000-0003-4843-6375;;;0000-0001-7047-9801", "linkedin": "xiyang-wu-0b120614b/;rohanchandra30/;tianrui-guan-b76733ba/;;dinesh-manocha-2311846", "or_profile": "~Xiyang_Wu1;~Rohan_Chandra1;~Tianrui_Guan1;~Amrit_Bedi1;~Dinesh_Manocha3", "aff": "University of Maryland, College Park;University of Texas at Austin;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;utexas.edu;cs.umd.edu;umd.edu;umd.edu", "position": "PhD student;Postdoc;PhD student;Researcher;Professor", "bibtex": "@inproceedings{\nwu2023intentaware,\ntitle={Intent-Aware Planning in Heterogeneous Traffic via Distributed Multi-Agent Reinforcement Learning},\nauthor={Xiyang Wu and Rohan Chandra and Tianrui Guan and Amrit Bedi and Dinesh Manocha},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=EvuAJ0wD98}\n}", "github": "", "project": "", "reviewers": "qQsj;htJq;vQKv", "site": "https://openreview.net/forum?id=EvuAJ0wD98", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;4;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 54, "authors#_avg": 5, "corr_rating_confidence": -1.0, "gs_citation": 10, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=14194540467100762647&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;0", "aff_unique_norm": "University of Maryland;University of Texas at Austin;University of Maryland, College Park", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www.utexas.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UT Austin;UMD", "aff_campus_unique_index": "0;1;0;0;0", "aff_campus_unique": "College Park;Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Eyb4e3GBuuR", "title": "Energy-based Potential Games for Joint Motion Forecasting and Control", "track": "main", "status": "Poster", "tldr": "A connection between differential games, optimal control, and energy-based models and an application for multi-agent motion forecasting and control, combining neural networks with differentiable game-theoretic optimization.", "abstract": "This work uses game theory as a mathematical framework to address interaction modeling in multi-agent motion forecasting and control. Despite its interpretability, applying game theory to real-world robotics, like automated driving, faces challenges such as unknown game parameters. To tackle these, we establish a connection between differential games, optimal control, and energy-based models, demonstrating how existing approaches can be unified under our proposed Energy-based Potential Game formulation. Building upon this, we introduce a new end-to-end learning application that combines neural networks for game-parameter inference with a differentiable game-theoretic optimization layer, acting as an inductive bias. The analysis provides empirical evidence that the game-theoretic layer adds interpretability and improves the predictive performance of various neural network backbones using two simulations and two real-world driving datasets.", "keywords": "Trajectory Prediction;Multi-Agent Interaction;Game-Theoretic Motion Planning;Energy-based Model;Optimal Control;Autonomous Vehicles", "primary_area": "", "supplementary_material": "/attachment/5eea748f6b2a5a022e990f647a969561ed3811f7.zip", "author": "Christopher Diehl;Tobias Klosek;Martin Krueger;Nils Murzyn;Timo Osterburg;Torsten Bertram", "authorids": "~Christopher_Diehl1;~Tobias_Klosek1;~Martin_Krueger1;~Nils_Murzyn1;~Timo_Osterburg1;~Torsten_Bertram1", "gender": ";M;M;M;M;", "homepage": ";https://rst.etit.tu-dortmund.de/lehrstuhl/team/klosek/;https://rst.etit.tu-dortmund.de/lehrstuhl/team/krueger/;https://www.linkedin.com/in/nils-murzyn/;https://de.linkedin.com/in/timo-osterburg-a3142817a;", "dblp": "272/4414;;39/784.html;;;", "google_scholar": "8HsbmCMAAAAJ;;https://scholar.google.de/citations?user=LA8gmTIAAAAJ;;https://scholar.google.de/citations?user=vl_7Cm8AAAAJ;", "orcid": ";;0000-0003-0544-0331;;0000-0001-7637-0700;", "linkedin": ";;https://de.linkedin.com/in/martin-kr%C3%BCger-686252a6;;;", "or_profile": "~Christopher_Diehl1;~Tobias_Klosek1;~Martin_Krueger1;~Nils_Murzyn1;~Timo_Osterburg1;~Torsten_Bertram1", "aff": "TU Dortmund University;Technische Universit\u00e4t Dortmund;TU Dortmund University;;Technische Universit\u00e4t Dortmund;", "aff_domain": "tu-dortmund.de;tu-dortmund.de;tu-dortmund.de;;tu-dortmund.de;", "position": "PhD student;MS student;PhD student;;PhD student;", "bibtex": "@inproceedings{\ndiehl2023energybased,\ntitle={Energy-based Potential Games for Joint Motion Forecasting and Control},\nauthor={Christopher Diehl and Tobias Klosek and 
Martin Krueger and Nils Murzyn and Timo Osterburg and Torsten Bertram},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Eyb4e3GBuuR}\n}", "github": "https://github.com/rst-tu-dortmund/diff_epo_planner", "project": "", "reviewers": "xj4d;ZfDq;LVGe", "site": "https://openreview.net/forum?id=Eyb4e3GBuuR", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 22, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13788918585781948738&as_sdt=40005&sciodt=0,10&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Dortmund", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-dortmund.de", "aff_unique_abbr": "TU Dortmund", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Dortmund;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "FRKBdXhkQE0", "title": "FastRLAP: A System for Learning High-Speed Driving via Deep RL and Autonomous Practicing", "track": "main", "status": "Poster", "tldr": "Training a task-specific vision encoder (using offline RL to learn representations) enables online learning from pixels (without simulation) in under 20 minutes.", "abstract": "We present a system that enables an autonomous small-scale RC car to drive aggressively from visual observations using reinforcement learning (RL). Our system, FastRLAP, trains autonomously in the real world, without human interventions, and without requiring any simulation or expert demonstrations. Our system integrates a number of important components to make this possible: we initialize the representations for the RL policy and value function from a large prior dataset of other robots navigating in other environments (at low speed), which provides a navigation-relevant representation. From here, a sample-efficient online RL method uses a single low-speed user-provided demonstration to determine the desired driving course, extracts a set of navigational checkpoints, and autonomously practices driving through these checkpoints, resetting automatically on collision or failure. Perhaps surprisingly, we find that with appropriate initialization and choice of algorithm, our system can learn to drive over a variety of racing courses with less than 20 minutes of online training. 
The resulting policies exhibit emergent aggressive driving skills, such as timing braking and acceleration around turns and avoiding areas which impede the robot's motion, approaching the performance of a human driver using a similar first-person interface over the course of training.", "keywords": "reinforcement learning;offroad driving;vision-based navigation", "primary_area": "", "supplementary_material": "/attachment/fe597fa0c5342ad086c395d0c2aa19294e8327b1.zip", "author": "Kyle Stachowicz;Dhruv Shah;Arjun Bhorkar;Ilya Kostrikov;Sergey Levine", "authorids": "~Kyle_Stachowicz1;~Dhruv_Shah1;~Arjun_Bhorkar1;~Ilya_Kostrikov1;~Sergey_Levine1", "gender": "M;M;;M;M", "homepage": "https://kylesta.ch;http://cs.berkeley.edu/~shah;;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";;;https://dblp.org/pers/k/Kostrikov:Ilya.html;80/7594", "google_scholar": ";;;PTS2AOgAAAAJ;8R35rCwAAAAJ", "orcid": ";;;;", "linkedin": ";;arjun-bhorkar-306166160/;;", "or_profile": "~Kyle_Stachowicz1;~Dhruv_Shah1;~Arjun_Bhorkar1;~Ilya_Kostrikov1;~Sergey_Levine1", "aff": "University of California, Berkeley;UC Berkeley;University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;google.com", "position": "PhD student;PhD student;Undergrad student;Postdoc;Research Scientist", "bibtex": "@inproceedings{\nstachowicz2023fastrlap,\ntitle={Fast{RLAP}: A System for Learning High-Speed Driving via Deep {RL} and Autonomous Practicing},\nauthor={Kyle Stachowicz and Dhruv Shah and Arjun Bhorkar and Ilya Kostrikov and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=FRKBdXhkQE0}\n}", "github": "github.com/kylestach/fastrlap-release", "project": "", "reviewers": "q79b;gJxi;ieSQ;1JBu", "site": "https://openreview.net/forum?id=FRKBdXhkQE0", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;5;3", "rating_avg": 7.0, "confidence_avg": 3.75, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.5222329678670935, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12745131671236507636&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "FefFLN5FvIM", "title": "TactileVAD: Geometric Aliasing-Aware Dynamics for High-Resolution Tactile Control", "track": "main", "status": "Poster", "tldr": "Our paper introduces TactileVAD, a decoder-only control method that resolves tactile geometric aliasing, improving performance and reliability in touch-based manipulation across various tactile sensors.", "abstract": "Touch-based control is a promising approach to dexterous manipulation. However, existing tactile control methods often overlook tactile geometric aliasing which can compromise control performance and reliability. This type of aliasing occurs when different contact locations yield similar tactile signatures. To address this, we propose TactileVAD, a generative decoder-only linear latent dynamics formulation compatible with standard control methods that is capable of resolving geometric aliasing. 
We evaluate TactileVAD on two mechanically-distinct tactile sensors, SoftBubbles (pointcloud data) and Gelslim 3.0 (RGB data), showcasing its effectiveness in handling different sensing modalities. Additionally, we introduce the tactile cartpole, a novel benchmarking setup to evaluate the ability of a control method to respond to disturbances based on tactile input. Evaluations comparing TactileVAD to baselines suggest that our method is better able to achieve goal tactile configurations and hand poses.", "keywords": "Manipulation;tactile control;high-resolution tactile sensors", "primary_area": "", "supplementary_material": "/attachment/4205e273330e7d6d7f7035dcb1498fa1b53862a7.zip", "author": "Miquel Oller;Dmitry Berenson;Nima Fazeli", "authorids": "~Miquel_Oller1;~Dmitry_Berenson1;~Nima_Fazeli1", "gender": ";M;", "homepage": ";http://web.eecs.umich.edu/~dmitryb/;https://www.mmintlab.com", "dblp": ";;", "google_scholar": "N8LKz0kAAAAJ;x-n9rIMAAAAJ;", "orcid": ";0000-0002-9712-109X;", "linkedin": ";;", "or_profile": "~Miquel_Oller1;~Dmitry_Berenson1;~Nima_Fazeli1", "aff": "University of Michigan - Ann Arbor;University of Michigan;University of Michigan", "aff_domain": "umich.edu;umich.edu;umich.edu", "position": "PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\noller2023tactilevad,\ntitle={Tactile{VAD}: Geometric Aliasing-Aware Dynamics for High-Resolution Tactile Control},\nauthor={Miquel Oller and Dmitry Berenson and Nima Fazeli},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=FefFLN5FvIM}\n}", "github": "", "project": "", "reviewers": "rJSx;bqGm;pV8K", "site": "https://openreview.net/forum?id=FefFLN5FvIM", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;5", "rating_avg": 6.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17216125413571213635&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Ffn8Z4Q-zU", "title": "Gesture-Informed Robot Assistance via Foundation Models", "track": "main", "status": "Poster", "tldr": "We propose a framework, GIRAF, for more flexibly interpreting human gesture and language instructions by leveraging the power of large language models.", "abstract": "Gestures serve as a fundamental and significant mode of non-verbal communication among humans. Deictic gestures (such as pointing towards an object), in particular, offer valuable means of efficiently expressing intent in situations where language is inaccessible, restricted, or highly specialized. As a result, it is essential for robots to comprehend gestures in order to infer human intentions and establish more effective coordination with them. Prior work often rely on a rigid hand-coded library of gestures along with their meanings. However, interpretation of gestures is often context-dependent, requiring more flexibility and common-sense reasoning. In this work, we propose a framework, GIRAF, for more flexibly interpreting gesture and language instructions by leveraging the power of large language models. 
Our framework is able to accurately infer human intent and contextualize the meaning of their gestures for more effective human-robot collaboration. We instantiate the framework for three table-top manipulation tasks and demonstrate that it is both effective and preferred by users. We further demonstrate GIRAF\u2019s ability on reasoning about diverse types of gestures by curating a GestureInstruct dataset consisting of 36 different task scenarios. GIRAF achieved 81% success rate on finding the correct plan for tasks in GestureInstruct.\nVideos and datasets can be found on our project website: https://tinyurl.com/giraf23", "keywords": "Planning with Gestures;Human-Robot Interaction;LLM Reasoning", "primary_area": "", "supplementary_material": "", "author": "Li-Heng Lin;Yuchen Cui;Yilun Hao;Fei Xia;Dorsa Sadigh", "authorids": "~Li-Heng_Lin1;~Yuchen_Cui1;~Yilun_Hao1;~Fei_Xia1;~Dorsa_Sadigh1", "gender": "M;F;;M;F", "homepage": "https://lihenglin.github.io;https://yuchencui.cc;https://yih301.github.io;;https://dorsa.fyi/", "dblp": ";201/5416.html;285/4024;;117/3174", "google_scholar": "https://scholar.google.com/citations?hl=en;qQz2cm8AAAAJ;RjQF17YAAAAJ;pqP5_PgAAAAJ;ZaJEZpYAAAAJ", "orcid": ";0000-0001-7417-1222;;0000-0003-4343-1444;", "linkedin": ";;yilun-hao-86554a178/;;", "or_profile": "~Li-Heng_Lin1;~Yuchen_Cui1;~Yilun_Hao1;~Fei_Xia1;~Dorsa_Sadigh1", "aff": "Stanford University;Stanford University;Stanford University;Google;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;google.com;stanford.edu", "position": "MS student;Postdoc;MS student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nlin2023gestureinformed,\ntitle={Gesture-Informed Robot Assistance via Foundation Models},\nauthor={Li-Heng Lin and Yuchen Cui and Yilun Hao and Fei Xia and Dorsa Sadigh},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Ffn8Z4Q-zU}\n}", "github": "", "project": "", "reviewers": "h9cL;zpDJ;i78B;beCJ", "site": "https://openreview.net/forum?id=Ffn8Z4Q-zU", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;4", "rating_avg": 6.5, "confidence_avg": 4.0, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7128226555528807149&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;0;1;0", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "G_FEL3OkiR", "title": "Human-in-the-Loop Task and Motion Planning for Imitation Learning", "track": "main", "status": "Poster", "tldr": "We present Human-in-the-Loop Task and Motion Planning (HITL-TAMP), a novel system that selectively gives and takes control to and from a human teleoperator, enabling more efficient imitation learning.", "abstract": "Imitation learning from human demonstrations can teach robots complex manipulation skills, but is time-consuming and labor intensive. In contrast, Task and Motion Planning (TAMP) systems are automated and excel at solving long-horizon tasks, but they are difficult to apply to contact-rich tasks. 
In this paper, we present Human-in-the-Loop Task and Motion Planning (HITL-TAMP), a novel system that leverages the benefits of both approaches. The system employs a TAMP-gated control mechanism, which selectively gives and takes control to and from a human teleoperator. This enables the human teleoperator to manage a fleet of robots, maximizing data collection efficiency. The collected human data is then combined with an imitation learning framework to train a TAMP-gated policy, leading to superior performance compared to training on full task demonstrations. We compared HITL-TAMP to a conventional teleoperation system --- users gathered more than 3x the number of demos given the same time budget. Furthermore, proficient agents (75\\%+ success) could be trained from just 10 minutes of non-expert teleoperation data. Finally, we collected 2.1K demos with HITL-TAMP across 12 contact-rich, long-horizon tasks and show that the system often produces near-perfect agents. Videos and additional results at https://hitltamp.github.io .", "keywords": "Imitation Learning;Task and Motion Planning;Teleoperation", "primary_area": "", "supplementary_material": "/attachment/8ee56da2a440e612ec4855d954bc3d8b321a2754.zip", "author": "Ajay Mandlekar;Caelan Reed Garrett;Danfei Xu;Dieter Fox", "authorids": "~Ajay_Mandlekar1;~Caelan_Reed_Garrett1;~Danfei_Xu1;~Dieter_Fox1", "gender": "M;M;M;M", "homepage": "https://ai.stanford.edu/~amandlek/;http://web.mit.edu/caelan/www/;https://cs.stanford.edu/~danfei/;https://homes.cs.washington.edu/~fox/", "dblp": "https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay;161/9727;135/8443;f/DieterFox", "google_scholar": "MEz23joAAAAJ;KVUCqGwAAAAJ;J5D4kcoAAAAJ;DqXsbPAAAAAJ", "orcid": ";0000-0002-6474-1276;;", "linkedin": ";caelan-garrett-85197977/;;", "or_profile": "~Ajay_Mandlekar1;~Caelan_Reed_Garrett1;~Danfei_Xu1;~Dieter_Fox1", "aff": "NVIDIA;NVIDIA;NVIDIA;Department of Computer Science", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;cs.washington.edu", "position": "Researcher;Researcher;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nmandlekar2023humanintheloop,\ntitle={Human-in-the-Loop Task and Motion Planning for Imitation Learning},\nauthor={Ajay Mandlekar and Caelan Reed Garrett and Danfei Xu and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=G_FEL3OkiR}\n}", "github": "", "project": "", "reviewers": "usXb;LDnh;KfBx;SKPE", "site": "https://openreview.net/forum?id=G_FEL3OkiR", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "5;3;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3828126774615962405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "NVIDIA;Unknown Institution", "aff_unique_dep": "NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;", "aff_unique_abbr": "NVIDIA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "GsM2qJTAg-", "title": "Heteroscedastic Gaussian Processes and Random Features: Scalable Motion Primitives with Guarantees", "track": "main", "status": "Poster", "tldr": "Heteroscedastic Gaussian processes and random features provide a scalable and provably accurate motion primitive for learning from demonstration.", "abstract": 
"Heteroscedastic Gaussian processes (HGPs) are kernel-based, non-parametric models that can be used to infer nonlinear functions with time-varying noise. In robotics, they can be employed for learning from demonstration as motion primitives, i.e. as a model of the trajectories to be executed by the robot. HGPs provide variance estimates around the reference signal modeling the trajectory, capturing both the predictive uncertainty and the motion variability. However, similarly to standard Gaussian processes they suffer from a cubic complexity in the number of training points, due to the inversion of the kernel matrix. The uncertainty can be leveraged for more complex learning tasks, such as inferring the variable impedance profile required from a robotic manipulator. However, suitable approximations are needed to make HGPs scalable, at the price of potentially worsening the posterior mean and variance profiles. Motivated by these observations, we study the combination of HGPs and random features, which are a popular, data-independent approximation strategy of kernel functions. In a theoretical analysis, we provide novel guarantees on the approximation error of the HGP posterior due to random features. Moreover, we validate this scalable motion primitive on real robot data, related to the problem of variable impedance learning. In this way, we show that random features offer a viable and theoretically sound alternative for speeding up the trajectory processing, without sacrificing accuracy.", "keywords": "Gaussian process regression;random features;motion primitives", "primary_area": "", "supplementary_material": "/attachment/8e19656b8cb73a4f6e13d55faf503063de4e45ad.zip", "author": "Edoardo Caldarelli;Antoine Chatalic;Adri\u00e0 Colom\u00e9;Lorenzo Rosasco;Carme Torras", "authorids": "~Edoardo_Caldarelli1;~Antoine_Chatalic1;~Adri\u00e0_Colom\u00e91;~Lorenzo_Rosasco1;~Carme_Torras1", "gender": "M;;M;;F", "homepage": "https://www.iri.upc.edu/staff/ecaldarelli;;;;http://www.iri.upc.edu/people/torras/", "dblp": ";;123/6351.html;;", "google_scholar": "JVRCLkgAAAAJ;;Cdsw9j8AAAAJ;;", "orcid": ";;0000-0001-9715-4062;;", "linkedin": "edoardo-caldarelli-559449155/?locale=en_US;;;;", "or_profile": "~Edoardo_Caldarelli1;~Antoine_Chatalic1;~Adri\u00e0_Colom\u00e91;~Lorenzo_Rosasco1;~Carme_Torras1", "aff": "Universidad Polit\u00e9cnica de Cataluna;;Spanish National Research Council;;Institut de Rob\u00f2tica i Inform\u00e0tica Industrial, CSIC-UPC", "aff_domain": "upc.edu;;csic.es;;iri.upc.edu", "position": "PhD student;;Postdoc;;Full Professor", "bibtex": "@inproceedings{\ncaldarelli2023heteroscedastic,\ntitle={Heteroscedastic Gaussian Processes and Random Features: Scalable Motion Primitives with Guarantees},\nauthor={Edoardo Caldarelli and Antoine Chatalic and Adri{\\`a} Colom{\\'e} and Lorenzo Rosasco and Carme Torras},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=GsM2qJTAg-}\n}", "github": "https://github.com/LCSL/rff-hgp", "project": "", "reviewers": "JwFW;qoam;zk8k;kgu9", "site": "https://openreview.net/forum?id=GsM2qJTAg-", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "5;4;1;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.5773502691896258, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:mz70n2gYWbQJ:scholar.google.com/&scioq=Heteroscedastic+Gaussian+Processes+and+Random+Features:+Scalable+Motion+Primitives+with+Guarantees&hl=en&as_sdt=0,33", "gs_version_total": 6, "aff_unique_index": "0;1;2", "aff_unique_norm": "Universitat Polit\u00e8cnica de Catalunya;Spanish National Research Council;Institut de Rob\u00f2tica i Inform\u00e0tica Industrial", "aff_unique_dep": ";;CSIC-UPC", "aff_unique_url": "https://www.upc.edu;https://www.csic.es;https://www.iri.upc.edu/", "aff_unique_abbr": "UPC;CSIC;IRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Spain" }, { "id": "HCWoFkGe8L4", "title": "Revisiting Depth-guided Methods for Monocular 3D Object Detection by Hierarchical Balanced Depth", "track": "main", "status": "Poster", "tldr": "", "abstract": "Monocular 3D object detection has seen significant advancements with the incorporation of depth information. However, there remains a considerable performance gap compared to LiDAR-based methods, largely due to inaccurate depth estimation. We argue that this issue stems from the commonly used pixel-wise depth map loss, which inherently creates the imbalance of loss weighting between near and distant objects. To address these challenges, we propose MonoHBD (Monocular Hierarchical Balanced Depth), a comprehensive solution with the hierarchical mechanism. We introduce the Hierarchical Depth Map (HDM) structure that incorporates depth bins and depth offsets to enhance the localization accuracy for objects. Leveraging RoIAlign, our Balanced Depth Extractor (BDE) module captures both scene-level depth relationships and object-specific depth characteristics while considering the geometry properties through the inclusion of camera calibration parameters. Furthermore, we propose a novel depth map loss that regularizes object-level depth features to mitigate imbalanced loss propagation. Our model reaches state-of-the-art results on the KITTI 3D object detection benchmark while supporting real-time detection. Excessive ablation studies are also conducted to prove the efficacy of our proposed modules.", "keywords": "monocular 3D object detection;autonomous driving", "primary_area": "", "supplementary_material": "/attachment/98d645557ae9327d14c990c1e7c60f0fae7c4f67.zip", "author": "Yi-Rong Chen;Ching-Yu Tseng;Yi-Syuan Liou;Tsung-Han Wu;Winston H. 
Hsu", "authorids": "~Yi-Rong_Chen1;~Ching-Yu_Tseng1;~Yi-Syuan_Liou1;~Tsung-Han_Wu1;~Winston_H._Hsu2", "gender": "M;M;M;M;", "homepage": "http://www.cmlab.csie.ntu.edu.tw/~andy94077/;https://sty61010.github.io/;https://tsunghan-wu.github.io;https://winstonhsu.info/;", "dblp": ";;01/6790;16/5668.html;", "google_scholar": ";FcmRnIQAAAAJ;https://scholar.google.com.tw/citations?user=ykuVSuEAAAAJ;https://scholar.google.com.tw/citations?user=NOvDH3QAAAAJ;", "orcid": ";;;0000-0002-3330-0638;", "linkedin": ";ching-yu-tseng-58000018b/;tsunghanwu/;;yisyuanliou/", "or_profile": "~Yi-Rong_Chen1;~Ching-Yu_Tseng1;~Tsung-Han_Wu1;~Winston_Hsu1;~Yi_Syuan_Liou1", "aff": "Department of computer science and informational engineering, National Taiwan University;National Taiwan University;National Taiwan University;National Taiwan University;National Taiwan University", "aff_domain": "csie.ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw;ntu.edu", "position": "MS student;MS student;Researcher;Professor;MS student", "bibtex": "@inproceedings{\nchen2023revisiting,\ntitle={Revisiting Depth-guided Methods for Monocular 3D Object Detection by Hierarchical Balanced Depth},\nauthor={Yi-Rong Chen and Ching-Yu Tseng and Yi-Syuan Liou and Tsung-Han Wu and Winston H. Hsu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=HCWoFkGe8L4}\n}", "github": "", "project": "", "reviewers": "BqXK;yA3D;shF4", "site": "https://openreview.net/forum?id=HCWoFkGe8L4", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;5;5", "rating_avg": 6.0, "confidence_avg": 5.0, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5642159654322031905&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "National Taiwan University", "aff_unique_dep": "Department of Computer Science and Informational Engineering", "aff_unique_url": "https://www.ntu.edu.tw", "aff_unique_abbr": "NTU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "HDYMjiukjn", "title": "RoboPianist: Dexterous Piano Playing with Deep Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "We train anthropomorphic robot hands to play the piano using deep RL and release a simulated benchmark and dataset to advance high-dimensional control.", "abstract": "Replicating human-like dexterity in robot hands represents one of the largest open problems in robotics. Reinforcement learning is a promising approach that has achieved impressive progress in the last few years; however, the class of problems it has typically addressed corresponds to a rather narrow definition of dexterity as compared to human capabilities. To address this gap, we investigate piano-playing, a skill that challenges even the human limits of dexterity, as a means to test high-dimensional control, and which requires high spatial and temporal precision, and complex finger coordination and planning. We introduce RoboPianist, a system that enables simulated anthropomorphic hands to learn an extensive repertoire of 150 piano pieces where traditional model-based optimization struggles. We additionally introduce an open-sourced environment, benchmark of tasks, interpretable evaluation metrics, and open challenges for future study. 
Our website featuring videos, code, and datasets is available at https://kzakka.com/robopianist/", "keywords": "high-dimensional control;bi-manual dexterity", "primary_area": "", "supplementary_material": "/attachment/0f970bc2363c536f666a0ff3d2936331f6a7b11c.zip", "author": "Kevin Zakka;Philipp Wu;Laura Smith;Nimrod Gileadi;Taylor Howell;Xue Bin Peng;Sumeet Singh;Yuval Tassa;Pete Florence;Andy Zeng;Pieter Abbeel", "authorids": "~Kevin_Zakka1;~Philipp_Wu1;~Laura_Smith1;~Nimrod_Gileadi1;~Taylor_Howell1;~Xue_Bin_Peng1;~Sumeet_Singh3;~Yuval_Tassa2;~Pete_Florence1;~Andy_Zeng3;~Pieter_Abbeel2", "gender": "M;M;F;M;;M;M;;M;M;M", "homepage": "https://kzakka.com/;https://github.com/wuphilipp;;;https://thowell.github.io/;https://xbpeng.github.io;;http://www.peteflorence.com/;https://people.eecs.berkeley.edu/~pabbeel/;http://andyzeng.github.io/;", "dblp": ";;54/11024;;;;;;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy;20/4415", "google_scholar": "8qHnRnsAAAAJ;;;snHVatUAAAAJ;;https://scholar.google.ca/citations?user=FwxfQosAAAAJ;ZGpE5cYAAAAJ;;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;q7nFtUcAAAAJ;https://scholar.google.co.uk/citations?user=CjOTm_4AAAAJ", "orcid": ";;;;;;;;;;", "linkedin": ";;;nimrod-gileadi-6669b422;;;;;;;", "or_profile": "~Kevin_Zakka1;~Philipp_Wu1;~Laura_Smith1;~Nimrod_Gileadi1;~Taylor_Howell1;~Xue_Bin_Peng1;~Sumeet_Singh3;~Pete_Florence1;~Pieter_Abbeel2;~Andy_Zeng1;~yuval_tassa1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google DeepMind;Stanford University;Simon Fraser University;Google Brain Robotics;Google;Covariant;Google;Google", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;deepmind.com;stanford.edu;sfu.ca;google.com;google.com;covariant.ai;google.com;google.com", "position": "PhD student;PhD student;PhD student;Software Engineer;PhD student;Assistant Professor;Researcher;Research Scientist;Founder;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nzakka2023robopianist,\ntitle={RoboPianist: Dexterous Piano Playing with Deep Reinforcement Learning},\nauthor={Kevin Zakka and Philipp Wu and Laura Smith and Nimrod Gileadi and Taylor Howell and Xue Bin Peng and Sumeet Singh and Yuval Tassa and Pete Florence and Andy Zeng and Pieter Abbeel},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=HDYMjiukjn}\n}", "github": "https://github.com/google-research/robopianist", "project": "", "reviewers": "F4CG;28kq;eFF7;Dnam", "site": "https://openreview.net/forum?id=HDYMjiukjn", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;5", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 20, "authors#_avg": 11, "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7383602115982073881&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2;3;1;1;4;1;1", "aff_unique_norm": "University of California, Berkeley;Google;Stanford University;Simon Fraser University;Covariant", "aff_unique_dep": ";Google DeepMind;;;", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com;https://www.stanford.edu;https://www.sfu.ca;", "aff_unique_abbr": "UC Berkeley;DeepMind;Stanford;SFU;", "aff_campus_unique_index": "0;0;0;2;3;3;3;3", "aff_campus_unique": "Berkeley;;Stanford;Mountain View", "aff_country_unique_index": "0;0;0;1;0;2;0;0;0;0", "aff_country_unique": "United States;United Kingdom;Canada;" }, { "id": "HEIRj51lcS", "title": "Polybot: 
Training One Policy Across Robots While Embracing Variability", "track": "main", "status": "Poster", "tldr": "We develop CRADLE a method that efficiently learns new tasks using data collected on other robots without constraining the camera viewpoint, embodiment, or low-level controller.", "abstract": "Reusing large datasets is crucial to scale vision-based robotic manipulators to everyday scenarios due to the high cost of collecting robotic datasets. However, robotic platforms possess varying control schemes, camera viewpoints, kinematic configurations, and end-effector morphologies, posing significant challenges when transferring manipulation skills from one platform to another. To tackle this problem, we propose a set of key design decisions to train a single policy for deployment on multiple robotic platforms. Our framework first aligns the observation and action spaces of our policy across embodiments via utilizing wrist cameras and a unified, but modular codebase. To bridge the remaining domain shift, we align our policy's internal representations across embodiments via contrastive learning. We evaluate our method on a dataset collected over 60 hours spanning 6 tasks and 3 robots with varying joint configurations and sizes: the WidowX 250S, Franka Emika Panda, and Sawyer. Our results demonstrate significant improvements in success rate and sample efficiency for our policy when using new task data collected on a different robot, validating our proposed design decisions. More details and videos can be found on our project website: https://sites.google.com/view/cradle-multirobot", "keywords": "vision-based manipulation;multi-robot generalization", "primary_area": "", "supplementary_material": "/attachment/c1cad29b7a802d8f196a75ba16b96161cc69b2ba.zip", "author": "Jonathan Heewon Yang;Dorsa Sadigh;Chelsea Finn", "authorids": "~Jonathan_Heewon_Yang1;~Dorsa_Sadigh1;~Chelsea_Finn1", "gender": "M;F;F", "homepage": ";https://dorsa.fyi/;https://ai.stanford.edu/~cbfinn/", "dblp": ";117/3174;131/1783", "google_scholar": ";ZaJEZpYAAAAJ;vfPE6hgAAAAJ", "orcid": ";;", "linkedin": "jonathan-yang-7b5542124/;;", "or_profile": "~Jonathan_Heewon_Yang1;~Dorsa_Sadigh1;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Google", "aff_domain": "stanford.edu;stanford.edu;google.com", "position": "PhD student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nyang2023polybot,\ntitle={Polybot: Training One Policy Across Robots While Embracing Variability},\nauthor={Jonathan Heewon Yang and Dorsa Sadigh and Chelsea Finn},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=HEIRj51lcS}\n}", "github": "", "project": "", "reviewers": "Fa2w;D1Nx;tpTQ;LHXF", "site": "https://openreview.net/forum?id=HEIRj51lcS", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;5", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 10, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15694518242989920679&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "HYka22IcV6", "title": 
"Online Learning for Obstacle Avoidance", "track": "main", "status": "Poster", "tldr": "Use online learning methods to derive a regret-minimizing controller for obstacle avoidance tasks.", "abstract": "We approach the fundamental problem of obstacle avoidance for robotic systems via the lens of online learning. In contrast to prior work that either assumes worst-case realizations of uncertainty in the environment or a stationary stochastic model of uncertainty, we propose a method that is efficient to implement and provably grants instance-optimality with respect to perturbations of trajectories generated from an open-loop planner (in the sense of minimizing worst-case regret). The resulting policy adapts online to realizations of uncertainty and provably compares well with the best obstacle avoidance policy in hindsight from a rich class of policies. The method is validated in simulation on a dynamical system environment and compared to baseline open-loop planning and robust Hamilton-Jacobi reachability techniques. Further, it is implemented on a hardware example where a quadruped robot traverses a dense obstacle field and encounters input disturbances due to time delays, model uncertainty, and dynamics nonlinearities.", "keywords": "Regret;Online;Learning;Convex;Optimization;Obstacle", "primary_area": "", "supplementary_material": "/attachment/b9db646c038bbea9f8a0cd125b12ba4800f88a20.zip", "author": "David Snyder;Meghan Booker;Nathaniel Simon;Wenhan Xia;Daniel Suo;Elad Hazan;Anirudha Majumdar", "authorids": "~David_Snyder2;~Meghan_Booker1;~Nathaniel_Simon1;~Wenhan_Xia1;~Daniel_Suo1;~Elad_Hazan1;~Anirudha_Majumdar1", "gender": "M;;;F;M;M;M", "homepage": "https://irom-lab.princeton.edu/;https://megbooker.com;;https://wenhanlunaxia.github.io/;https://danielsuo.com;https://www.ehazan.com;https://irom-lab.princeton.edu/majumdar/", "dblp": ";;;;;72/739;116/6436", "google_scholar": ";;x3m1e5MAAAAJ;;;LnhCGNMAAAAJ;ibu3FwsAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~David_Snyder2;~Meghan_Booker1;~Nathaniel_Simon1;~Wenhan_Xia1;~Daniel_Suo1;~Elad_Hazan1;~Anirudha_Majumdar1", "aff": "Princeton University;Princeton University;Princeton University;Princeton University;;Princeton University;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;princeton.edu;;princeton.edu;princeton.edu", "position": "PhD student;PhD student;PhD student;PhD student;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nsnyder2023online,\ntitle={Online Learning for Obstacle Avoidance},\nauthor={David Snyder and Meghan Booker and Nathaniel Simon and Wenhan Xia and Daniel Suo and Elad Hazan and Anirudha Majumdar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=HYka22IcV6}\n}", "github": "", "project": "", "reviewers": "x84o;63fg;DrfS", "site": "https://openreview.net/forum?id=HYka22IcV6", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;3", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 19, "authors#_avg": 7, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10988661629297992940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "HtJE9ly5dT", "title": "Generative Skill Chaining: Long-Horizon Skill Planning with Diffusion Models", "track": "main", "status": "Poster", "tldr": "We introduce Generative Skill Chaining, a probabilistic framework that learns skill-centric diffusion models and composes their learned distributions to generate long-horizon plans for unseen task skeletons during inference.", "abstract": "Long-horizon tasks, usually characterized by complex subtask dependencies, present a significant challenge in manipulation planning. Skill chaining is a practical approach to solving unseen tasks by combining learned skill priors. However, such methods are myopic if sequenced greedily and face scalability issues with search-based planning strategy. To address these challenges, we introduce Generative Skill Chaining (GSC), a probabilistic framework that learns skill-centric diffusion models and composes their learned distributions to generate long-horizon plans during inference. GSC samples from all skill models in parallel to efficiently solve unseen tasks while enforcing geometric constraints. We evaluate the method on various long-horizon tasks and demonstrate its capability in reasoning about action dependencies, constraint handling, and generalization, along with its ability to replan in the face of perturbations. We show results in simulation and on real robot to validate the efficiency and scalability of GSC, highlighting its potential for advancing long-horizon task planning. More details are available at: https://generative-skill-chaining.github.io/", "keywords": "Manipulation Planning;Diffusion Models;Task and Motion Planning", "primary_area": "", "supplementary_material": "/attachment/d5f1806726d1ae3f5620727e76b62b627267a20e.zip", "author": "Utkarsh Aashu Mishra;Shangjie Xue;Yongxin Chen;Danfei Xu", "authorids": "~Utkarsh_Aashu_Mishra2;~Shangjie_Xue1;~Yongxin_Chen1;~Danfei_Xu1", "gender": "M;M;M;M", "homepage": "http://utkarshmishra04.github.io/;https://xsj01.github.io/;https://yongxin.ae.gatech.edu/;https://cs.stanford.edu/~danfei/", "dblp": "274/2706;283/5868;;135/8443", "google_scholar": "10HbT44AAAAJ;beSmo9QAAAAJ;X8BYiV4AAAAJ;J5D4kcoAAAAJ", "orcid": "0000-0002-4977-5187;0000-0003-2127-3414;;", "linkedin": "utkarshamishra/;;;", "or_profile": "~Utkarsh_Aashu_Mishra2;~Shangjie_Xue1;~Yongxin_Chen1;~Danfei_Xu1", "aff": "Sony R&D US Labs;Georgia Institute of Technology;Georgia Institute of Technology;NVIDIA", "aff_domain": "sony.com;gatech.edu;gatech.edu;nvidia.com", "position": "Intern;PhD student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nmishra2023generative,\ntitle={Generative Skill Chaining: Long-Horizon Skill Planning with Diffusion Models},\nauthor={Utkarsh Aashu Mishra and Shangjie Xue and Yongxin Chen and Danfei Xu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=HtJE9ly5dT}\n}", "github": "https://github.com/generative-skill-chaining/gsc-code", "project": "", "reviewers": "oxXV;1DYZ;VAfs", "site": "https://openreview.net/forum?id=HtJE9ly5dT", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 79, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12452273944817917586&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Sony;Georgia 
Institute of Technology;NVIDIA", "aff_unique_dep": "R&D;;NVIDIA Corporation", "aff_unique_url": "https://www.sony.com;https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Sony R&D US Labs;Georgia Tech;NVIDIA", "aff_campus_unique_index": "0", "aff_campus_unique": "United States;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "IM8zOC94HF", "title": "Fighting Uncertainty with Gradients: Offline Reinforcement Learning via Diffusion Score Matching", "track": "main", "status": "Poster", "tldr": "We propose Score Guided Planning (SGP), a gradient-based offline MBRL algorithm that stably minimizes uncertainty in high dimensions.", "abstract": "Gradient-based methods enable efficient search capabilities in high dimensions. However, in order to apply them effectively in offline optimization paradigms such as offline Reinforcement Learning (RL) or Imitation Learning (IL), we require a more careful consideration of how uncertainty estimation interplays with first-order methods that attempt to minimize them. We study smoothed distance to data as an uncertainty metric, and claim that it has two beneficial properties: (i) it allows gradient-based methods that attempt to minimize uncertainty to drive iterates to data as smoothing is annealed, and (ii) it facilitates analysis of model bias with Lipschitz constants. As distance to data can be expensive to compute online, we consider settings where we need amortize this computation. Instead of learning the distance however, we propose to learn its gradients directly as an oracle for first-order optimizers. We show these gradients can be efficiently learned with score-matching techniques by leveraging the equivalence between distance to data and data likelihood. Using this insight, we propose Score-Guided Planning (SGP), a planning algorithm for offline RL that utilizes score-matching to enable first-order planning in high-dimensional problems, where zeroth-order methods were unable to scale, and ensembles were unable to overcome local minima. Website: https://sites.google.com/view/score-guided-planning/home", "keywords": "Model-Based Reinforcement Learning;Offline Reinforcement Learning;Planning under Uncertainty;Diffusion;Score Matching", "primary_area": "", "supplementary_material": "/attachment/eed51128b1075c38e9d8bd2546459ce70b1373ef.zip", "author": "H.J. 
Terry Suh;Glen Chou;Hongkai Dai;Lujie Yang;Abhishek Gupta;Russ Tedrake", "authorids": "~H.J._Terry_Suh1;~Glen_Chou1;~Hongkai_Dai1;~Lujie_Yang1;~Abhishek_Gupta1;~Russ_Tedrake1", "gender": "M;;;;M;M", "homepage": "https://www.hjrobotics.net;https://glenchou.github.io;;;https://homes.cs.washington.edu/~abhgupta/;http://people.csail.mit.edu/russt", "dblp": ";190/7534;;;18/6404-4;73/1296", "google_scholar": "B6iZVE0AAAAJ;90whi3wAAAAJ;ZZsEXLAAAAAJ;oU5haR0AAAAJ;1wLVDP4AAAAJ;nxNkEiYAAAAJ", "orcid": ";0000-0003-4444-3631;;;;", "linkedin": ";;;;;", "or_profile": "~H.J._Terry_Suh1;~Glen_Chou1;~Hongkai_Dai1;~Lujie_Yang1;~Abhishek_Gupta1;~Russ_Tedrake1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Toyota Research Institute;Massachusetts Institute of Technology;University of Washington;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;tri.global;mit.edu;uw.edu;mit.edu", "position": "PhD student;Postdoc;Researcher;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nsuh2023fighting,\ntitle={Fighting Uncertainty with Gradients: Offline Reinforcement Learning via Diffusion Score Matching},\nauthor={H.J. Terry Suh and Glen Chou and Hongkai Dai and Lujie Yang and Abhishek Gupta and Russ Tedrake},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=IM8zOC94HF}\n}", "github": "https://github.com/hjsuh94/score_po", "project": "", "reviewers": "9mZ5;X384;Esgu;bqCh", "site": "https://openreview.net/forum?id=IM8zOC94HF", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;2;3", "rating_avg": 7.0, "confidence_avg": 3.25, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": -0.17407765595569782, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13819223087462678900&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Research Institute;University of Washington", "aff_unique_dep": ";;", "aff_unique_url": "https://web.mit.edu;https://www.tri.global;https://www.washington.edu", "aff_unique_abbr": "MIT;TRI;UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Id4b5SY1Y8", "title": "PairwiseNet: Pairwise Collision Distance Learning for High-dof Robot Systems", "track": "main", "status": "Poster", "tldr": "PairwiseNet is a novel method that estimates the pairwise collision distance between pairs of elements in a robot system, providing an alternative approach to data-driven methods that estimate the global collision distance.", "abstract": "Motion planning for robot manipulation systems operating in complex environments remains a challenging problem. It requires the evaluation of both the collision distance and its derivative. Owing to its computational complexity, recent studies have attempted to utilize data-driven approaches to learn the collision distance. However, their performance degrades significantly for complicated high-dof systems, such as multi-arm robots. Additionally, the model must be retrained every time the environment undergoes even slight changes. In this paper, we propose PairwiseNet, a model that estimates the minimum distance between two geometric shapes and overcomes many of the limitations of current models. 
By dividing the problem of global collision distance learning into smaller pairwise sub-problems, PairwiseNet can be used to efficiently calculate the global collision distance. PairwiseNet can be deployed without further modifications or training for any system comprised of the same shape elements (as those in the training dataset). Experiments with multi-arm manipulation systems of various dof indicate that our model achieves significant performance improvements concerning several performance metrics, especially the false positive rate with the collision-free guaranteed threshold. Results further demonstrate that our single trained PairwiseNet model is applicable to all multi-arm systems used in the evaluation. The code is available at https://github.com/kjh6526/PairwiseNet.", "keywords": "Robot Collision;Collision Distance;Machine Learning", "primary_area": "", "supplementary_material": "/attachment/95e2a3a94d869151611742f4fdcd445b5995365c.zip", "author": "Jihwan Kim;Frank C. Park", "authorids": "~Jihwan_Kim2;~Frank_C._Park1", "gender": "M;M", "homepage": "http://robot.snu.ac.kr/;http://robotics.snu.ac.kr", "dblp": ";p/FrankChongwooPark", "google_scholar": ";u-h3PJIAAAAJ", "orcid": ";0000-0002-0293-6975", "linkedin": ";", "or_profile": "~Jihwan_Kim2;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr", "position": "PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2023pairwisenet,\ntitle={PairwiseNet: Pairwise Collision Distance Learning for High-dof Robot Systems},\nauthor={Jihwan Kim and Frank C. Park},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Id4b5SY1Y8}\n}", "github": "https://github.com/kjh6526/PairwiseNet", "project": "", "reviewers": "fxzY;vQwN;uPAS;xJsm", "site": "https://openreview.net/forum?id=Id4b5SY1Y8", "pdf_size": 0, "rating": "1;6;6;6", "confidence": "5;4;4;4", "rating_avg": 4.75, "confidence_avg": 4.25, "replies_avg": 19, "authors#_avg": 2, "corr_rating_confidence": -1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7508225388609916046&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "South Korea" }, { "id": "IeKC9khX5jD", "title": "Affordance-Driven Next-Best-View Planning for Robotic Grasping", "track": "main", "status": "Poster", "tldr": "", "abstract": "Grasping occluded objects in cluttered environments is an essential component in complex robotic manipulation tasks. In this paper, we introduce an AffordanCE-driven Next-Best-View planning policy (ACE-NBV) that tries to find a feasible grasp for the target object by continuously observing scenes from new viewpoints. This policy is motivated by the observation that the grasp affordances of an occluded object can be better measured when the view direction is the same as the grasp view. Specifically, our method leverages the paradigm of novel view imagery to predict the grasp affordances under previously unobserved views, and selects the next observation view based on the highest imagined grasp quality of the target object. 
The experimental results in simulation and on a real robot demonstrate the effectiveness of the proposed affordance-driven next-best-view planning policy. Project page: https://sszxc.net/ace-nbv/.", "keywords": "Grasp Synthesis;Neural SDF;Next-Best-View Planning", "primary_area": "", "supplementary_material": "/attachment/c06e40afee640e4ab1dd4c45f18340ad96e82b75.zip", "author": "Xuechao Zhang;Dong Wang;Sun Han;Weichuang Li;Bin Zhao;Zhigang Wang;Xiaoming Duan;Chongrong Fang;Xuelong Li;Jianping He", "authorids": "~Xuechao_Zhang1;~Dong_Wang1;~Sun_Han1;~Weichuang_Li1;~Bin_Zhao7;~Zhigang_Wang3;~Xiaoming_Duan1;~Chongrong_Fang1;~Xuelong_Li2;~Jianping_He1", "gender": "M;M;Non-Binary;M;M;;M;M;M;M", "homepage": ";https://redwang.github.io/;https://github.com/sunhan1997;https://www.waytron.net/;https://iopen.nwpu.edu.cn/info/1347/2105.htm;https://xmduan.github.io/index.html;https://automation.sjtu.edu.cn/FCR;;https://iwin-fins.com/;", "dblp": ";40/3934-28;;318/0593;73/4325-1.html;;;l/XuelongLi;;35/1989-2", "google_scholar": ";dasL9V4AAAAJ;;742-_K0AAAAJ;https://scholar.google.com.hk/citations?user=DQB0hqwAAAAJ;;;ahUibskAAAAJ;;cw3EaAYAAAAJ", "orcid": ";;;;;;;;;", "linkedin": "sszxc/;;;;;;;;;", "or_profile": "~Xuechao_Zhang1;~Dong_Wang1;~Sun_Han1;~Weichuang_Li1;~Bin_Zhao7;~Xiaoming_Duan1;~Chongrong_Fang1;~Xuelong_Li2;~Jianping_He1;~Zhi.gang_Wang1", "aff": "Shanghai Jiaotong University;Shanghai AI Laboratory;;Shanghai AI Laboratory;Northwest Polytechnical University Xi'an;Shanghai Jiaotong University;Shanghai Jiaotong University;Northwestern Polytechnical University;Shanghai Jiaotong University;Shanghai AI Lab", "aff_domain": "sjtu.edu.cn;pjlab.org.cn;;pjlab.org.cn;nwpu.edu.cn;sjtu.edu.cn;sjtu.edu.cn;nwpu.edu.cn;sjtu.edu.cn;pjlab.org.cn", "position": "MS student;Researcher;;Intern;Associate Professor;Assistant Professor;Assistant Professor;Full Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nzhang2023affordancedriven,\ntitle={Affordance-Driven Next-Best-View Planning for Robotic Grasping},\nauthor={Xuechao Zhang and Dong Wang and Sun Han and Weichuang Li and Bin Zhao and Zhigang Wang and Xiaoming Duan and Chongrong Fang and Xuelong Li and Jianping He},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=IeKC9khX5jD}\n}", "github": "", "project": "", "reviewers": "hPBU;3RPC;pq4R;6CeP", "site": "https://openreview.net/forum?id=IeKC9khX5jD", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;5;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 10, "corr_rating_confidence": 0.816496580927726, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8409537177197749554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;0;0;3;0;4", "aff_unique_norm": "Shanghai Jiao Tong University;Shanghai AI Laboratory;Northwest Polytechnical University;Northwestern Polytechnical University;Shanghai AI Lab", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.shanghai-ai-lab.com;http://www.nwpu.edu.cn;https://www.nwpu.edu.cn;https://www.shanghaiailab.com", "aff_unique_abbr": "SJTU;SAIL;NWPU;NWPU;SAIL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "JdpleC92J4", "title": "AR2-D2: Training a Robot Without a Robot", "track": "main", "status": "Poster", "tldr": "A novel framework for collecting trainable robot demonstrations without 
the need for a real robot.", "abstract": "Diligently gathered human demonstrations serve as the unsung heroes empowering the progression of robot learning.\nToday, demonstrations are collected by training people to use specialized controllers, which (tele-)operate robots to manipulate a small number of objects. By contrast, we introduce AR2-D2: a system for collecting demonstrations which (1) does not require people with specialized training, (2) does not require any real robots during data collection, and therefore, (3) enables manipulation of diverse objects with a real robot. AR2-D2 is a framework in the form of an iOS app that people can use to record a video of themselves manipulating any object while simultaneously capturing essential data modalities for training a real robot. We show that data collected via our system enables the training of behavior cloning agents in manipulating real objects. Our experiments further show that training with our AR data is as effective as training with real-world robot demonstrations. Moreover, our user study indicates that users find AR2-D2 intuitive to use and require no training in contrast to four other frequently employed methods for collecting robot demonstrations.", "keywords": "Demonstration collection;Imitation learning;Augmented reality;Manipulating personalized objects;Dataset collection;Behavior Cloning", "primary_area": "", "supplementary_material": "/attachment/079f5cb8fa95ce856a8bf84b8fa39b895d796e07.zip", "author": "Jiafei Duan;Yi Ru Wang;Mohit Shridhar;Dieter Fox;Ranjay Krishna", "authorids": "~Jiafei_Duan1;~Yi_Ru_Wang1;~Mohit_Shridhar1;~Dieter_Fox1;~Ranjay_Krishna1", "gender": "M;;M;M;M", "homepage": "https://duanjiafei.com/;;http://mohitshridhar.com/;https://homes.cs.washington.edu/~fox/;http://ranjaykrishna.com", "dblp": "275/9973.html;302/0208;203/8577.html;f/DieterFox;167/3785", "google_scholar": "d1WCSJIAAAAJ;OTL-u30AAAAJ;CrfsfFSiS0kC;DqXsbPAAAAAJ;IcqahyAAAAAJ", "orcid": ";;0000-0001-7382-763X;;0000-0001-8784-2531", "linkedin": "jiafei-duan-a69b11112/;yi-ru-helen-wang/;;;ranjay-krishna-1a344444/", "or_profile": "~Jiafei_Duan1;~Yi_Ru_Wang1;~Mohit_Shridhar1;~Dieter_Fox1;~Ranjay_Krishna1", "aff": "University of Washington;University of Washington;Department of Computer Science, University of Washington;Department of Computer Science;University of Washington", "aff_domain": "uw.edu;washington.edu;cs.washington.edu;cs.washington.edu;cs.washington.edu", "position": "PhD student;PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nduan2023ard,\ntitle={{AR}2-D2: Training a Robot Without a Robot},\nauthor={Jiafei Duan and Yi Ru Wang and Mohit Shridhar and Dieter Fox and Ranjay Krishna},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=JdpleC92J4}\n}", "github": "https://github.com/jiafei1224/AR2-D2_Utils", "project": "", "reviewers": "sq69;FwiR;PnSQ;UipD", "site": "https://openreview.net/forum?id=JdpleC92J4", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;5;4", "rating_avg": 7.0, "confidence_avg": 4.25, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12395870291507769754&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Washington;Unknown Institution", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": 
"https://www.washington.edu;", "aff_unique_abbr": "UW;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "JkFeyEC6VXV", "title": "Finetuning Offline World Models in the Real World", "track": "main", "status": "Oral", "tldr": "We propose a framework for offline-to-online finetuning of world models and demonstrate its effectiveness in real robot tasks.", "abstract": "Reinforcement Learning (RL) is notoriously data-inefficient, which makes training on a real robot difficult. While model-based RL algorithms (world models) improve data-efficiency to some extent, they still require hours or days of interaction to learn skills. Recently, offline RL has been proposed as a framework for training RL policies on pre-existing datasets without any online interaction. However, constraining an algorithm to a fixed dataset induces a state-action distribution shift between training and inference, and limits its applicability to new tasks. In this work, we seek to get the best of both worlds: we consider the problem of pretraining a world model with offline data collected on a real robot, and then finetuning the model on online data collected by planning with the learned model. To mitigate extrapolation errors during online interaction, we propose to regularize the planner at test-time by balancing estimated returns and (epistemic) model uncertainty. We evaluate our method on a variety of visuo-motor control tasks in simulation and on a real robot, and find that our method enables few-shot finetuning to seen and unseen tasks even when offline data is limited. Videos are available at https://yunhaifeng.com/FOWM", "keywords": "Model-Based Reinforcement Learning;Real-World Robotics", "primary_area": "", "supplementary_material": "/attachment/7178f1121edca537d16db93a56d2bc12ecd19a2f.zip", "author": "Yunhai Feng;Nicklas Hansen;Ziyan Xiong;Chandramouli Rajagopalan;Xiaolong Wang", "authorids": "~Yunhai_Feng1;~Nicklas_Hansen1;~Ziyan_Xiong2;~Chandramouli_Rajagopalan1;~Xiaolong_Wang3", "gender": ";Non-Binary;M;M;M", "homepage": ";https://nicklashansen.github.io;https://ziyanx02.github.io/;https://chamorajg.github.io/;https://xiaolonw.github.io/", "dblp": ";258/0744.html;;;91/952-4", "google_scholar": ";OFtDgzwAAAAJ;;;Y8O9N_0AAAAJ", "orcid": ";0000-0001-9897-4003;;;", "linkedin": ";ncklas;;;", "or_profile": "~Yunhai_Feng1;~Nicklas_Hansen1;~Ziyan_Xiong2;~Chandramouli_Rajagopalan1;~Xiaolong_Wang3", "aff": ";University of California, San Diego;Tsinghua University;Cerenaut AI;University of California, San Diego", "aff_domain": ";ucsd.edu;tsinghua.edu.cn;cerenaut.ai;ucsd.edu", "position": ";PhD student;Undergrad student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nfeng2023finetuning,\ntitle={Finetuning Offline World Models in the Real World},\nauthor={Yunhai Feng and Nicklas Hansen and Ziyan Xiong and Chandramouli Rajagopalan and Xiaolong Wang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=JkFeyEC6VXV}\n}", "github": "", "project": "", "reviewers": "UN8h;LZqg;mpkG;S26C", "site": "https://openreview.net/forum?id=JkFeyEC6VXV", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;3;3;3", "rating_avg": 8.0, "confidence_avg": 3.25, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2952097136616764913&as_sdt=5,44&sciodt=0,44&hl=en", 
"gs_version_total": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of California, San Diego;Tsinghua University;Cerenaut AI", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.tsinghua.edu.cn;", "aff_unique_abbr": "UCSD;THU;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "K8cGp8rVLIP", "title": "Neural Field Dynamics Model for Granular Object Piles Manipulation", "track": "main", "status": "Poster", "tldr": "Our approach combines trajectory optimization and differentiable rendering for granular object manipulation. It introduces a unified density-field-based representation for object states and actions, utilizing a FCN to predict physical dynamics.", "abstract": "We present a learning-based dynamics model for granular material manipulation. Drawing inspiration from computer graphics' Eulerian approach, our method adopts a fully convolutional neural network that operates on a density field-based representation of object piles, allowing it to exploit the spatial locality of inter-object interactions through the convolution operations. This approach greatly improves the learning and computation efficiency compared to existing latent or particle-based methods and sidesteps the need for state estimation, making it directly applicable to real-world settings. Furthermore, our differentiable action rendering module makes the model fully differentiable and can be directly integrated with a gradient-based algorithm for curvilinear trajectory optimization. We evaluate our model with a wide array of piles manipulation tasks both in simulation and real-world experiments and demonstrate that it significantly exceeds existing methods in both accuracy and computation efficiency. 
More details can be found at https://sites.google.com/view/nfd-corl23/", "keywords": "Deformable Object Manipulation;Manipulation Planning", "primary_area": "", "supplementary_material": "/attachment/35def0274fc45b2fc71eb179719bd6916685a9c8.zip", "author": "Shangjie Xue;Shuo Cheng;Pujith Kachana;Danfei Xu", "authorids": "~Shangjie_Xue1;~Shuo_Cheng1;superp329@gmail.com;~Danfei_Xu1", "gender": "M;M;;M", "homepage": "https://xsj01.github.io/;https://sites.google.com/view/shuocheng/home;;https://cs.stanford.edu/~danfei/", "dblp": "283/5868;179/0863;;135/8443", "google_scholar": "beSmo9QAAAAJ;5CL_0qMAAAAJ;;J5D4kcoAAAAJ", "orcid": "0000-0003-2127-3414;;;", "linkedin": ";;;", "or_profile": "~Shangjie_Xue1;~Shuo_Cheng1;superp329@gmail.com;~Danfei_Xu1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;;NVIDIA", "aff_domain": "gatech.edu;gatech.edu;;nvidia.com", "position": "PhD student;PhD student;;Research Scientist", "bibtex": "@inproceedings{\nxue2023neural,\ntitle={Neural Field Dynamics Model for Granular Object Piles Manipulation},\nauthor={Shangjie Xue and Shuo Cheng and Pujith Kachana and Danfei Xu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=K8cGp8rVLIP}\n}", "github": "https://sites.google.com/view/nfd-corl23/", "project": "", "reviewers": "9WbQ;Un8n;khMs;LnEd", "site": "https://openreview.net/forum?id=K8cGp8rVLIP", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6203871934552026122&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "MF_cS7TCYk", "title": "Multi-Predictor Fusion: Combining Learning-based and Rule-based Trajectory Predictors", "track": "main", "status": "Poster", "tldr": "We present an approach that combines learning- and rule-based trajectory predictors via recursive Bayesian filtering and demonstrate its ability to deliver more consistent performance than the standalone predictors.", "abstract": "Trajectory prediction modules are key enablers for safe and efficient planning of autonomous vehicles (AVs), particularly in highly interactive traffic scenarios. Recently, learning-based trajectory predictors have experienced considerable success in providing state-of-the-art performance due to their ability to learn multimodal behaviors of other agents from data. In this paper, we present an algorithm called multi-predictor fusion (MPF) that augments the performance of learning-based predictors by imbuing them with motion planners that are tasked with satisfying logic-based rules. MPF probabilistically combines learning- and rule-based predictors by mixing trajectories from both standalone predictors in accordance with a belief distribution that reflects the online performance of each predictor. 
In our results, we show that MPF outperforms the two standalone predictors on various metrics and delivers the most consistent performance.", "keywords": "trajectory prediction;rule-based planning", "primary_area": "", "supplementary_material": "/attachment/d15d553adf78fcc5ffb95dac977aa4bca3f20bc1.zip", "author": "Sushant Veer;Apoorva Sharma;Marco Pavone", "authorids": "~Sushant_Veer1;~Apoorva_Sharma1;~Marco_Pavone1", "gender": "M;M;M", "homepage": ";https://web.stanford.edu/~apoorva;https://web.stanford.edu/~pavone/", "dblp": "173/5950;181/4231;91/3382-1.html", "google_scholar": "1FiIlQsAAAAJ;3bBgnTIAAAAJ;RhOpyXcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Sushant_Veer1;~Apoorva_Sharma1;~Marco_Pavone1", "aff": "NVIDIA;NVIDIA;Stanford University", "aff_domain": "nvidia.com;nvidia.com;stanford.edu", "position": "Researcher;Researcher;Associate Professor", "bibtex": "@inproceedings{\nveer2023multipredictor,\ntitle={Multi-Predictor Fusion: Combining Learning-based and Rule-based Trajectory Predictors},\nauthor={Sushant Veer and Apoorva Sharma and Marco Pavone},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=MF_cS7TCYk}\n}", "github": "", "project": "", "reviewers": "VeXK;4qup;oqEg", "site": "https://openreview.net/forum?id=MF_cS7TCYk", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;5;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5744592381512737561&as_sdt=5,47&sciodt=0,47&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "NVIDIA;Stanford University", "aff_unique_dep": "NVIDIA Corporation;", "aff_unique_url": "https://www.nvidia.com;https://www.stanford.edu", "aff_unique_abbr": "NVIDIA;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "MnANx01rV2w", "title": "CAJun: Continuous Adaptive Jumping using a Learned Centroidal Controller", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present CAJun, a novel hierarchical learning and control framework that enables legged robots to jump continuously with adaptive jumping distances. CAJun consists of a high-level centroidal policy and a low-level leg controller. In particular, we use reinforcement learning (RL) to train the centroidal policy, which specifies the gait timing, base velocity, and swing foot position for the leg controller. The leg controller optimizes motor commands for the swing and stance legs according to the gait timing to track the swing foot target and base velocity commands. Additionally, we reformulate the stance leg optimizer in the leg controller to speed up policy training by an order of magnitude. Our system combines the versatility of learning with the robustness of optimal control. We show that after 20 minutes of training on a single GPU, CAJun can achieve continuous, long jumps with adaptive distances on a Go1 robot with small sim-to-real gaps.
Moreover, the robot can jump across gaps with a maximum width of 70cm, which is over 40% wider than existing methods.", "keywords": "Jumping;Legged Locomotion;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/d522d9d44c1a5fd29849259b9ba598d44ad25210.zip", "author": "Yuxiang Yang;Guanya Shi;Xiangyun Meng;Wenhao Yu;Tingnan Zhang;Jie Tan;Byron Boots", "authorids": "~Yuxiang_Yang2;~Guanya_Shi1;~Xiangyun_Meng1;~Wenhao_Yu1;~Tingnan_Zhang1;~Jie_Tan1;~Byron_Boots1", "gender": "M;M;;M;M;M;", "homepage": "https://yxyang.github.io;http://guanyashi.github.io;https://homes.cs.washington.edu/~xiangyun;https://wenhaoyu.weebly.com/;;http://www.jie-tan.net;", "dblp": ";230/4386;169/3352;;https://dblp.uni-trier.de/pers/hd/z/Zhang:Tingnan;81/7419;", "google_scholar": "2NQKmzIAAAAJ;joR1Z4UAAAAJ;;1bF2s2kAAAAJ;RM2vMNcAAAAJ;neGbgzYAAAAJ;", "orcid": ";0000-0002-9075-3705;;;;;", "linkedin": ";guanya-shi-b07b43126/;;;;jie-tan/;", "or_profile": "~Yuxiang_Yang2;~Guanya_Shi1;~Xiangyun_Meng1;~Wenhao_Yu1;~Tingnan_Zhang1;~Jie_Tan1;~Byron_Boots1", "aff": "Google;University of Washington;University of Washington;Google;Google;Google;", "aff_domain": "google.com;uw.edu;washington.edu;google.com;google.com;google.com;", "position": "Researcher;Postdoc;PhD student;Software Engineer;Software Engineer;Research Scientist;", "bibtex": "@inproceedings{\nyang2023cajun,\ntitle={{CAJ}un: Continuous Adaptive Jumping using a Learned Centroidal Controller},\nauthor={Yuxiang Yang and Guanya Shi and Xiangyun Meng and Wenhao Yu and Tingnan Zhang and Jie Tan and Byron Boots},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=MnANx01rV2w}\n}", "github": "https://github.com/yxyang/cajun/", "project": "", "reviewers": "JTyM;Dbjy;mtMy", "site": "https://openreview.net/forum?id=MnANx01rV2w", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;5;5", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999997, "gs_citation": 30, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4082556641062932698&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;0;0;0", "aff_unique_norm": "Google;University of Washington", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.washington.edu", "aff_unique_abbr": "Google;UW", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "N3VbFUpwaa", "title": "Generalization of Heterogeneous Multi-Robot Policies via Awareness and Communication of Capabilities", "track": "main", "status": "Poster", "tldr": "We investigate how the awareness and communication of robots capabilities can enable generalization of heterogeneous multi-robot coordination policies training using multi-agent reinforcement learning.", "abstract": "Recent advances in multi-agent reinforcement learning (MARL) are enabling impressive coordination in heterogeneous multi-robot teams. However, existing approaches often overlook the challenge of generalizing learned policies to teams of new compositions, sizes, and robots. While such generalization might not be important in teams of virtual agents that can retrain policies on-demand, it is pivotal in multi-robot systems that are deployed in the real-world and must readily adapt to inevitable changes. 
As such, multi-robot policies must remain robust to team changes -- an ability we call adaptive teaming. In this work, we investigate if awareness and communication of robot capabilities can provide such generalization by conducting detailed experiments involving an established multi-robot test bed. We demonstrate that shared decentralized policies, that enable robots to be both aware of and communicate their capabilities, can achieve adaptive teaming by implicitly capturing the fundamental relationship between collective capabilities and effective coordination. Videos of trained policies can be viewed at https://sites.google.com/view/cap-comm .", "keywords": "Heterogeneity;Multi-Robot Teaming;Generalization", "primary_area": "", "supplementary_material": "/attachment/8d4a114105b45f40c4198d20cf4426e76be68d7b.zip", "author": "Pierce Howell;Max Rudolph;Reza Joseph Torbati;Kevin Fu;Harish Ravichandar", "authorids": "~Pierce_Howell1;~Max_Rudolph1;~Reza_Joseph_Torbati1;~Kevin_Fu2;~Harish_Ravichandar1", "gender": "M;M;M;;", "homepage": ";https://maxrudolph1.github.io/;;;http://harishravichandar.com/", "dblp": ";298/8056.html;;;237/9959", "google_scholar": "KTovTiQAAAAJ;https://scholar.google.com/citations?view_op=list_works;;;d2HP6SMAAAAJ", "orcid": ";;;;0000-0002-6635-2637", "linkedin": ";;rezatorbati/;kevin-fu-5260341b1/;", "or_profile": "~Pierce_Howell1;~Max_Rudolph1;~Reza_Joseph_Torbati1;~Kevin_Fu2;~Harish_Ravichandar1", "aff": "Georgia Institute of Technology;University of Texas at Austin;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;utexas.edu;gatech.edu;gatech.edu;gatech.edu", "position": "MS student;PhD student;MS student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nhowell2023generalization,\ntitle={Generalization of Heterogeneous Multi-Robot Policies via Awareness and Communication of Capabilities},\nauthor={Pierce Howell and Max Rudolph and Reza Joseph Torbati and Kevin Fu and Harish Ravichandar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=N3VbFUpwaa}\n}", "github": "", "project": "", "reviewers": "jEiU;FZZg;foRa;TgTL", "site": "https://openreview.net/forum?id=N3VbFUpwaa", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;3", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14156129491309648056&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Georgia Institute of Technology;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.gatech.edu;https://www.utexas.edu", "aff_unique_abbr": "Georgia Tech;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Nii0_rRJwN", "title": "CALAMARI: Contact-Aware and Language conditioned spatial Action MApping for contact-RIch manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Making contact with purpose is a central part of robot manipulation and remains essential for many household tasks -- from sweeping dust into a dustpan, to wiping tables; from erasing whiteboards, to applying paint. 
In this work, we investigate learning language-conditioned, vision-based manipulation policies wherein the action representation is, in fact, contact itself -- predicting contact formations at which tools grasped by the robot should meet an observable surface. Our approach, Contact-Aware and Language conditioned spatial Action MApping for contact-RIch manipulation (CALAMARI), exhibits several advantages including (i) benefiting from existing visual-language models for pretrained spatial features, grounding instructions to behaviors, and for sim2real transfer; and (ii) factorizing perception and control over a natural boundary (i.e., contact) into two modules that synergize with each other, whereby action predictions can be aligned per pixel with image observations, and low-level controllers can optimize motion trajectories that maintain contact while avoiding penetration. Experiments show that CALAMARI outperforms existing state-of-the-art model architectures for a broad range of contact-rich tasks, and pushes new ground on embodiment-agnostic generalization to unseen objects with varying elasticity, geometry, and colors in both simulated and real-world settings.", "keywords": "Contact-rich Manipulation;Visual-language guided policies", "primary_area": "", "supplementary_material": "/attachment/68d11981eb32caac0e8f6cb26015e46d7b492d95.zip", "author": "Youngsun Wi;Mark Van der Merwe;Pete Florence;Andy Zeng;Nima Fazeli", "authorids": "~Youngsun_Wi1;~Mark_Van_der_Merwe1;~Pete_Florence1;~Andy_Zeng3;~Nima_Fazeli1", "gender": ";M;;;M", "homepage": "https://www.mmintlab.com/;https://mvandermerwe.github.io/;http://www.peteflorence.com/;https://www.mmintlab.com;http://andyzeng.github.io/", "dblp": ";249/5378;;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy", "google_scholar": ";cKmwbi0AAAAJ;;;q7nFtUcAAAAJ", "orcid": ";;;;", "linkedin": "youngsun-wi-1332761a0/;;;;", "or_profile": "~Youngsun_Wi1;~Mark_Van_der_Merwe1;~Pete_Florence1;~Nima_Fazeli1;~Andy_Zeng1", "aff": "University of Michigan;University of Michigan - Ann Arbor;Google;University of Michigan;Google", "aff_domain": "umich.edu;umich.edu;google.com;umich.edu;google.com", "position": "PhD student;PhD student;Research Scientist;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nwi2023calamari,\ntitle={{CALAMARI}: Contact-Aware and Language conditioned spatial Action {MA}pping for contact-{RI}ch manipulation},\nauthor={Youngsun Wi and Mark Van der Merwe and Pete Florence and Andy Zeng and Nima Fazeli},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Nii0_rRJwN}\n}", "github": "", "project": "", "reviewers": "eq6y;hB1X;GLce;ceQE", "site": "https://openreview.net/forum?id=Nii0_rRJwN", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2494014006528041207&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;1", "aff_unique_norm": "University of Michigan;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.umich.edu;https://www.google.com", "aff_unique_abbr": "UM;Google", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Ann Arbor;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "PK2debCKaG", "title": "Language Conditioned Traffic Generation", "track": "main",
"status": "Poster", "tldr": "We propose a language-conditioned traffic scenario generation model powered by LLM.", "abstract": "Simulation forms the backbone of modern self-driving development. Simulators help develop, test, and improve driving systems without putting humans, vehicles, or their environment at risk. However, simulators face a major challenge: They rely on realistic, scalable, yet interesting content. While recent advances in rendering and scene reconstruction make great strides in creating static scene assets, modeling their layout, dynamics, and behaviors remains challenging. In this work, we turn to language as a source of supervision for dynamic traffic scene generation. Our model, LCTGen, combines a large language model with a transformer-based decoder architecture that selects likely map locations from a dataset of maps, and produces an initial traffic distribution, as well as the dynamics of each vehicle. LCTGen outperforms prior work in both unconditional and conditional traffic scene generation in terms of realism and fidelity.", "keywords": "Self-driving;Content generation;Large language model", "primary_area": "", "supplementary_material": "/attachment/ad9e55d54da22a69282c1d060ddc135c8f01909f.zip", "author": "Shuhan Tan;Boris Ivanovic;Xinshuo Weng;Marco Pavone;Philipp Kraehenbuehl", "authorids": "~Shuhan_Tan2;~Boris_Ivanovic1;~Xinshuo_Weng3;~Marco_Pavone1;~Philipp_Kraehenbuehl1", "gender": "M;;F;M;M", "homepage": "https://ariostgx.github.io/website/;http://www.borisivanovic.com/;http://www.xinshuoweng.com;https://web.stanford.edu/~pavone/;http://www.philkr.net/", "dblp": ";203/8356;192/1952.html;91/3382-1.html;43/7592", "google_scholar": "Ro6enEEAAAAJ;ey9AQcEAAAAJ;dthSEsoAAAAJ;RhOpyXcAAAAJ;https://scholar.google.com.tw/citations?user=dzOd2hgAAAAJ", "orcid": ";0000-0002-8698-202X;0000-0002-7894-4381;;", "linkedin": ";boris-ivanovic-a3103064;xinshuoweng;;", "or_profile": "~Shuhan_Tan2;~Boris_Ivanovic1;~Xinshuo_Weng3;~Marco_Pavone1;~Philipp_Kraehenbuehl1", "aff": "NVIDIA;NVIDIA;NVIDIA;Stanford University;Apple", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;stanford.edu;apple.com", "position": "Research Intern;Researcher;Researcher;Associate Professor;Researcher", "bibtex": "@inproceedings{\ntan2023language,\ntitle={Language Conditioned Traffic Generation},\nauthor={Shuhan Tan and Boris Ivanovic and Xinshuo Weng and Marco Pavone and Philipp Kraehenbuehl},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=PK2debCKaG}\n}", "github": "https://github.com/Ariostgx/lctgen", "project": "", "reviewers": "zVxc;iJXw;ixPH;5bXt", "site": "https://openreview.net/forum?id=PK2debCKaG", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "3;4;3;3", "rating_avg": 7.0, "confidence_avg": 3.25, "replies_avg": 7, "authors#_avg": 5, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 63, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14214850468285991828&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "NVIDIA;Stanford University;Apple", "aff_unique_dep": "NVIDIA Corporation;;Apple Inc.", "aff_unique_url": "https://www.nvidia.com;https://www.stanford.edu;https://www.apple.com", "aff_unique_abbr": "NVIDIA;Stanford;Apple", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "PalhNjBJqv", "title": "A Data-efficient Neural ODE Framework for Optimal 
Control of Soft Manipulators", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper introduces a novel approach for modeling continuous forward kinematic models of soft continuum robots by employing Augmented Neural ODE (ANODE), a cutting-edge family of deep neural network models. To the best of our knowledge, this is the first application of ANODE in modeling soft continuum robots. This formulation introduces auxiliary dimensions, allowing the system's states to evolve in the augmented space which provides a richer set of dynamics that the model can learn, increasing the flexibility and accuracy of the model. Our methodology achieves exceptional sample efficiency, training the continuous forward kinematic model using only 25 scattered data points. Additionally, we design and implement a fully parallel Model Predictive Path Integral (MPPI)-based controller running on a GPU, which efficiently manages a non-convex objective function. Through a set of experiments, we showed that the proposed framework (ANODE+MPPI) significantly outperforms state-of-the-art learning-based methods such as FNN and RNN in unseen-before\n scenarios and marginally outperforms them in seen-before scenarios.", "keywords": "Soft robots;Non-parametric modelling;Optimal control", "primary_area": "", "supplementary_material": "/attachment/701207bd9867381c0c842dbb86994934322d2965.zip", "author": "Mohammadreza Kasaei;Keyhan Kouhkiloui Babarahmati;Zhibin Li;Mohsen Khadem", "authorids": "~Mohammadreza_Kasaei1;~Keyhan_Kouhkiloui_Babarahmati1;~Zhibin_Li2;~Mohsen_Khadem1", "gender": "M;M;M;", "homepage": "https://mohammadkasaei.github.io/Mohammadreza-Kasaei/;;https://www.research.ed.ac.uk/en/persons/zhibin-alex-li/;https://homepages.inf.ed.ac.uk/skhadem/", "dblp": ";;;", "google_scholar": "2aY06V4AAAAJ;jOY4TnoAAAAJ;;https://scholar.google.co.uk/citations?user=EdlB5Q8AAAAJ", "orcid": ";;;", "linkedin": ";keyhankouhkiloui/;;", "or_profile": "~Mohammadreza_Kasaei1;~Keyhan_Kouhkiloui_Babarahmati1;~Zhibin_Li2;~Mohsen_Khadem1", "aff": "University of Edinburgh, University of Edinburgh;University of Edinburgh, University of Edinburgh;University College London, University of London;Edinburgh University, University of Edinburgh", "aff_domain": "ed.ac.uk;ed.ac.uk;ucl.ac.uk;inf.ed.ac.uk", "position": "Postdoc;Postdoc;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nkasaei2023a,\ntitle={A Data-efficient Neural {ODE} Framework for Optimal Control of Soft Manipulators},\nauthor={Mohammadreza Kasaei and Keyhan Kouhkiloui Babarahmati and Zhibin Li and Mohsen Khadem},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=PalhNjBJqv}\n}", "github": "https://github.com/MohammadKasaei/SoftRobotSimulator", "project": "", "reviewers": "e7Lr;pRah;p7k6", "site": "https://openreview.net/forum?id=PalhNjBJqv", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=507016865461962812&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Edinburgh;University College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.ed.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "Edinburgh;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "United Kingdom" }, { "id": "PsV65r0itpo", "title": "Navigation with Large Language Models: Semantic Guesswork as a Heuristic for Planning", "track": "main", "status": "Poster", "tldr": "Using LLMs as heuristics for planning outperforms popular LLM planning methods and achieves great results on sim benchmarks and real-world evals!", "abstract": "Navigation in unfamiliar environments presents a major challenge for robots: while mapping and planning techniques can be used to build up a representation of the world, quickly discovering a path to a desired goal in unfamiliar settings with such methods often requires lengthy mapping and exploration. Humans can rapidly navigate new environments, particularly indoor environments that are laid out logically, by leveraging semantics --- e.g., a kitchen often adjoins a living room, an exit sign indicates the way out, and so forth. Language models can provide robots with such knowledge, but directly using language models to instruct a robot how to reach some destination can also be impractical: while language models might produce a narrative about how to reach some goal, because they are not grounded in real-world observations, this narrative might be arbitrarily wrong. Therefore, in this paper we study how the ``semantic guesswork'' produced by language models can be utilized as a guiding heuristic for planning algorithms. Our method, Language Frontier Guide (LFG), uses the language model to bias exploration of novel real-world environments by incorporating the semantic knowledge stored in language models as a search heuristic for planning with either topological or metric maps. We evaluate LFG in challenging real-world environments and simulated benchmarks, outperforming uninformed exploration and other ways of using language models.", "keywords": "navigation;language models;planning;semantics", "primary_area": "", "supplementary_material": "/attachment/471413d678fd59d331a46376304a66356f1a0e42.zip", "author": "Dhruv Shah;Michael Robert Equi;B\u0142a\u017cej Osi\u0144ski;Fei Xia;brian ichter;Sergey Levine", "authorids": "~Dhruv_Shah1;~Michael_Robert_Equi1;~B\u0142a\u017cej_Osi\u0144ski1;~Fei_Xia1;~brian_ichter1;~Sergey_Levine1", "gender": "M;M;;M;;M", "homepage": "http://cs.berkeley.edu/~shah;;;;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";;218/5547;;;80/7594", "google_scholar": ";;WuWWdKcAAAAJ;pqP5_PgAAAAJ;-w5DuHgAAAAJ;8R35rCwAAAAJ", "orcid": ";;;0000-0003-4343-1444;;", "linkedin": ";michael-equi/;;;;", "or_profile": "~Dhruv_Shah1;~Michael_Robert_Equi1;~B\u0142a\u017cej_Osi\u0144ski1;~Fei_Xia1;~brian_ichter1;~Sergey_Levine1", "aff": "UC Berkeley;University of California, Berkeley;University of Warsaw;Google;Google;Google", "aff_domain": "berkeley.edu;berkeley.edu;mimuw.edu.pl;google.com;google.com;google.com", "position": "PhD student;Undergrad student;PhD student;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nshah2023navigation,\ntitle={Navigation with Large Language Models: Semantic Guesswork as a Heuristic for Planning},\nauthor={Dhruv Shah and Michael Robert Equi and B{\\l}a{\\.z}ej Osi{\\'n}ski and Fei Xia and brian ichter and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=PsV65r0itpo}\n}", "github": "https://github.com/Michael-Equi/lfg-nav", "project": "", "reviewers": "4mQY;FZa9;oCzb;JLdp", "site": "https://openreview.net/forum?id=PsV65r0itpo", "pdf_size": 0, "rating": "4;6;6;10", "confidence": 
"4;5;4;4", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": -0.13245323570650439, "gs_citation": 118, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2534392660281037225&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;2;2", "aff_unique_norm": "University of California, Berkeley;University of Warsaw;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.uw.edu.pl;https://www.google.com", "aff_unique_abbr": "UC Berkeley;UW;Google", "aff_campus_unique_index": "0;0;2;2;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;Poland" }, { "id": "PwqiqaaEzJ", "title": "MUTEX: Learning Unified Policies from Multimodal Task Specifications", "track": "main", "status": "Poster", "tldr": "We introduce Mutex, a unified policy that learns to perform tasks based on task specifications in multiple modalities.", "abstract": "Humans use different modalities, such as speech, text, images, videos, etc., to communicate their intent and goals with teammates. For robots to become better assistants, we aim to endow them with the ability to follow instructions and understand tasks specified by their human partners. Most robotic policy learning methods have focused on one single modality of task specification while ignoring the rich cross-modal information. We present MUTEX, a unified approach to policy learning from multimodal task specifications. It trains a transformer-based architecture to facilitate cross-modal reasoning, combining masked modeling and cross-modal matching objectives in a two-stage training procedure. After training, MUTEX can follow a task specification in any of the six learned modalities (video demonstrations, goal images, text goal descriptions, text instructions, speech goal descriptions, and speech instructions) or a combination of them. We systematically evaluate the benefits of MUTEX in a newly designed dataset with 100 tasks in simulation and 50 tasks in the real world, annotated with multiple instances of task specifications in different modalities, and observe improved performance over methods trained specifically for any single modality. 
More information at https://ut-austin-rpl.github.io/MUTEX/", "keywords": "Multimodal Learning;Task Specification;Manipulation", "primary_area": "", "supplementary_material": "", "author": "Rutav Shah;Roberto Mart\u00edn-Mart\u00edn;Yuke Zhu", "authorids": "~Rutav_Shah1;~Roberto_Mart\u00edn-Mart\u00edn1;~Yuke_Zhu1", "gender": "M;M;M", "homepage": "https://shahrutav.github.io;https://robertomartinmartin.com/;https://cs.utexas.edu/~yukez/", "dblp": ";153/7670;133/1772", "google_scholar": ";XOJE8OEAAAAJ;mWGyYMsAAAAJ", "orcid": ";0000-0002-9586-2759;", "linkedin": "rutav-shah-01a2941a7;;", "or_profile": "~Rutav_Shah1;~Roberto_Mart\u00edn-Mart\u00edn1;~Yuke_Zhu1", "aff": "University of Texas at Austin;University of Texas at Austin;Computer Science Department, University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;cs.utexas.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nshah2023mutex,\ntitle={{MUTEX}: Learning Unified Policies from Multimodal Task Specifications},\nauthor={Rutav Shah and Roberto Mart{\\'\\i}n-Mart{\\'\\i}n and Yuke Zhu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=PwqiqaaEzJ}\n}", "github": "https://github.com/UT-Austin-RPL/MUTEX", "project": "", "reviewers": "n91Z;JffC;G6C8", "site": "https://openreview.net/forum?id=PwqiqaaEzJ", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;4;4", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 65, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14273204437991450397&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "Pwsm7d0iWJD", "title": "Learning Lyapunov-Stable Polynomial Dynamical Systems Through Imitation", "track": "main", "status": "Poster", "tldr": "This research presents an approach to stable motion planning from expert's demonstrations by jointly learning a dynamical system and Lyapunov candidate through constrained polynomial regression.", "abstract": "Imitation learning is a paradigm to address complex motion planning problems by learning a policy to imitate an expert's behavior. However, relying solely on the expert's data might lead to unsafe actions when the robot deviates from the demonstrated trajectories. Stability guarantees have previously been provided utilizing nonlinear dynamical systems, acting as high-level motion planners, in conjunction with the Lyapunov stability theorem. Yet, these methods are prone to inaccurate policies, high computational cost, sample inefficiency, or quasi stability when replicating complex and highly nonlinear trajectories. To mitigate this problem, we present an approach for learning a globally stable nonlinear dynamical system as a motion planning policy. We model the nonlinear dynamical system as a parametric polynomial and learn the polynomial's coefficients jointly with a Lyapunov candidate. To showcase its success, we compare our method against the state of the art in simulation and conduct real-world experiments with the Kinova Gen3 Lite manipulator arm. 
Our experiments demonstrate the sample efficiency and reproduction accuracy of our method for various expert trajectories, while remaining stable in the face of perturbations.", "keywords": "Imitation learning;Safe learning;Motion planning;Dynamical system;Semidefinite programming;Robotic manipulation", "primary_area": "", "supplementary_material": "/attachment/99ab20731a0ebd86b6338af4aa6b7dab57f6ec44.zip", "author": "Amin Abyaneh;Hsiu-Chin Lin", "authorids": "~Amin_Abyaneh1;~Hsiu-Chin_Lin1", "gender": "F;M", "homepage": "https://sites.google.com/site/hsiuchinlin/;https://www.linkedin.com/in/amin-abyaneh-02a255123?originalSubdomain=ca", "dblp": "77/3470;", "google_scholar": "https://scholar.google.co.uk/citations?user=ZJo2sPEAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Hsiu-Chin_Lin1;~Amin_Mohammad_Soleimani_Abyaneh1", "aff": "McGill University;McGill University", "aff_domain": "mcgill.ca;mcgill.ca", "position": "Assistant Professor;PhD student", "bibtex": "@inproceedings{\nabyaneh2023learning,\ntitle={Learning Lyapunov-Stable Polynomial Dynamical Systems Through Imitation},\nauthor={Amin Abyaneh and Hsiu-Chin Lin},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Pwsm7d0iWJD}\n}", "github": "https://github.com/aminabyaneh/stable-imitation-policy", "project": "", "reviewers": "ZV92;fdCw;x3iz;gRUM", "site": "https://openreview.net/forum?id=Pwsm7d0iWJD", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;5;5;5", "rating_avg": 6.0, "confidence_avg": 5.0, "replies_avg": 17, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1954925401363661374&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "McGill University", "aff_unique_dep": "", "aff_unique_url": "https://www.mcgill.ca", "aff_unique_abbr": "McGill", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Canada" }, { "id": "Q8BGLiWn2X", "title": "PLEX: Making the Most of the Available Data for Robotic Manipulation Pretraining", "track": "main", "status": "Poster", "tldr": "A model architecture for robotic manipulation tailored to the realities of robotic manipulation datasets", "abstract": "A rich representation is key to general robotic manipulation, but existing approaches to representation learning require large amounts of multimodal demonstrations. In this work we propose PLEX, a transformer-based architecture that learns from a small amount of task-agnostic visuomotor trajectories and a much larger amount of task-conditioned object manipulation videos \u2014 a type of data available in quantity. PLEX uses visuomotor trajectories to induce a latent feature space and to learn task-agnostic manipulation routines, while diverse video-only demonstrations teach PLEX how to plan in the induced latent feature space for a wide variety of tasks. Experiments showcase PLEX\u2019s generalization on Meta-World and SOTA performance in challenging Robosuite environments. 
In particular, using relative positional encoding in PLEX\u2019s transformers greatly helps in low-data regimes of learning from human-collected demonstrations.", "keywords": "Robot learning;Robotic manipulation;Visuomotor representations", "primary_area": "", "supplementary_material": "/attachment/11eef8a1f30d46f5d35a6823e7c8785e80e400db.zip", "author": "Garrett Thomas;Ching-An Cheng;Ricky Loynd;Felipe Vieira Frujeri;Vibhav Vineet;Mihai Jalobeanu;Andrey Kolobov", "authorids": "~Garrett_Thomas1;~Ching-An_Cheng1;~Ricky_Loynd1;~Felipe_Vieira_Frujeri1;~Vibhav_Vineet5;~Mihai_Jalobeanu1;~Andrey_Kolobov1", "gender": "M;M;M;;;;M", "homepage": "https://ai.stanford.edu/~gwthomas/;http://www.chinganc.com;https://www.microsoft.com/en-us/research/people/riloynd/;;;http://mihaij.com/;https://www.microsoft.com/en-us/research/people/akolobov/", "dblp": "186/8227;123/6369;;;;;95/3462", "google_scholar": "Bm3pH5AAAAAJ;bMZFLZ_V4goC;CXPpSu0AAAAJ;wy0FA1cAAAAJ;;;xEWgxBsAAAAJ", "orcid": ";;0000-0003-3541-6586;;;;", "linkedin": ";;ricky-loynd-b7a90452/;;;;", "or_profile": "~Garrett_Thomas1;~Ching-An_Cheng1;~Ricky_Loynd1;~Felipe_Vieira_Frujeri1;~Vibhav_Vineet5;~Mihai_Jalobeanu1;~Andrey_Kolobov1", "aff": "Stanford University;Microsoft Research;Microsoft Research;;;Microsoft Research;Microsoft", "aff_domain": "stanford.edu;microsoft.com;microsoft.com;;;research.microsoft.com;microsoft.com", "position": "PhD student;Principal Researcher;Research software development engineer;;;Software Engineer;Researcher", "bibtex": "@inproceedings{\nthomas2023plex,\ntitle={{PLEX}: Making the Most of the Available Data for Robotic Manipulation Pretraining},\nauthor={Garrett Thomas and Ching-An Cheng and Ricky Loynd and Felipe Vieira Frujeri and Vibhav Vineet and Mihai Jalobeanu and Andrey Kolobov},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Q8BGLiWn2X}\n}", "github": "", "project": "", "reviewers": "aYEz;yTsm;8ioC;aM5x", "site": "https://openreview.net/forum?id=Q8BGLiWn2X", "pdf_size": 0, "rating": "4;4;10;10", "confidence": "5;3;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2964592145896982446&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Stanford University;Microsoft", "aff_unique_dep": ";Microsoft Research", "aff_unique_url": "https://www.stanford.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "Stanford;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Q9ezhChqnL", "title": "Towards Scalable Coverage-Based Testing of Autonomous Vehicles", "track": "main", "status": "Poster", "tldr": "We propose a scalable testing framework for testing self-driving vehicles in parameterized scenario.", "abstract": "To deploy autonomous vehicles(AVs) in the real world, developers must understand the conditions in which the system can operate safely. To do this in a scalable manner, AVs are often tested in simulation on parameterized scenarios. In this context, it\u2019s important to build a testing framework that partitions the scenario parameter space into safe, unsafe, and unknown regions. 
Existing approaches rely on discretizing continuous parameter spaces into bins, which scales poorly to high-dimensional spaces and cannot describe regions with arbitrary shape. In this work, we introduce a problem formulation which avoids discretization \u2014 by modeling the probability of meeting safety requirements everywhere, the parameter space can be partitioned using a probability threshold. Based on our formulation, we propose GUARD as a testing framework which leverages Gaussian Processes to model probability and levelset algorithms to efficiently generate tests. Moreover, we introduce a set of novel evaluation metrics for coverage-based testing frameworks to capture the key objectives of testing. In our evaluation suite of diverse high-dimensional scenarios, GUARD significantly outperforms existing approaches. By proposing an efficient, accurate, and scalable testing framework, our work is a step towards safely deploying autonomous vehicles at scale.", "keywords": "Self-Driving;Coverage;Testing", "primary_area": "", "supplementary_material": "/attachment/1e5e2eff457a99b63056d193b1a127d015aa87a8.zip", "author": "James Tu;Simon Suo;Chris Zhang;Kelvin Wong;Raquel Urtasun", "authorids": "~James_Tu1;~Simon_Suo2;~Chris_Zhang2;~Kelvin_Wong1;~Raquel_Urtasun1", "gender": "M;M;F;M;M", "homepage": ";https://www.cs.toronto.edu/~kelvinwong/;http://www.cs.toronto.edu/~urtasun/;;http://simonsuo.com", "dblp": ";195/5219;u/RaquelUrtasun;52/6610-1;174/4170", "google_scholar": "https://scholar.google.ca/citations?user=x6gPeg4AAAAJ;K4EqMuAAAAAJ;https://scholar.google.ca/citations?user=jyxO2akAAAAJ;d0BhFY0AAAAJ;vq10uQMAAAAJ", "orcid": ";;;;", "linkedin": ";;;;sdsuo/", "or_profile": "~James_Tu1;~Kelvin_Wong1;~Raquel_Urtasun1;~Chris_Jia_Han_Zhang1;~Simon_Suo1", "aff": "Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto", "aff_domain": "cs.toronto.edu;cs.toronto.edu;cs.toronto.edu;cs.toronto.edu;cs.toronto.edu", "position": "PhD student;PhD student;Full Professor;PhD student;PhD student", "bibtex": "@inproceedings{\ntu2023towards,\ntitle={Towards Scalable Coverage-Based Testing of Autonomous Vehicles},\nauthor={James Tu and Simon Suo and Chris Zhang and Kelvin Wong and Raquel Urtasun},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Q9ezhChqnL}\n}", "github": "", "project": "", "reviewers": "YzeP;QJej;yt5X;WNoS", "site": "https://openreview.net/forum?id=Q9ezhChqnL", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;3;4;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13114010964055154609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Toronto", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "QG_ERxtDAP-", "title": "Curiosity-Driven Learning of Joint Locomotion and Manipulation Tasks", "track": "main", "status": "Poster", "tldr": "An intrinsically motivated RL approach 
that simplifies learning complex robotics tasks, such as door opening, is proposed and validated.", "abstract": "Learning complex locomotion and manipulation tasks presents significant challenges, often requiring extensive engineering of, e.g., reward functions or curricula to provide meaningful feedback to the Reinforcement Learning (RL) algorithm. This paper proposes an intrinsically motivated RL approach to reduce task-specific engineering. The desired task is encoded in a single sparse reward, i.e., a reward of \u201c+1\u201d is given if the task is achieved. Intrinsic motivation enables learning by guiding exploration toward the sparse reward signal. Specifically, we adapt the idea of Random Network Distillation (RND) to the robotics domain to learn holistic motion control policies involving simultaneous locomotion and manipulation. We investigate opening doors as an exemplary task for robotic applications. A second task involving package manipulation from a table to a bin highlights the generalization capabilities of the presented approach. Finally, the resulting RL policies are executed in real-world experiments on a wheeled-legged robot in biped mode. We experienced no failure in our experiments, which consisted of opening push doors (over 15 times in a row) and manipulating packages (over 5 times in a row).", "keywords": "Curiosity;Reinforcement Learning;Wheeled-Legged Robots", "primary_area": "", "supplementary_material": "/attachment/0d857e99e7db14e417d8bf989bb52e718e54b4ac.zip", "author": "Clemens Schwarke;Victor Klemm;Matthijs van der Boon;Marko Bjelonic;Marco Hutter", "authorids": "~Clemens_Schwarke1;~Victor_Klemm1;~Matthijs_van_der_Boon1;markob@ethz.ch;~Marco_Hutter1", "gender": "M;M;M;;M", "homepage": ";https://victorklemm.com;;;http://www.rsl.ethz.ch", "dblp": ";;;;04/2753", "google_scholar": ";-3pMVPUAAAAJ;EPcO5KIAAAAJ;;https://scholar.google.ch/citations?user=DO3quJYAAAAJ", "orcid": ";0000-0002-6752-3397;0000-0003-2427-704X;;0000-0002-4285-4990", "linkedin": "clemensschwarke/;https://linkedin.com/in/victor-klemm-6a68231ab;;;", "or_profile": "~Clemens_Schwarke1;~Victor_Klemm1;~Matthijs_van_der_Boon1;markob@ethz.ch;~Marco_Hutter1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;;;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;;;ethz.ch", "position": "MS student;PhD student;;;Associate Professor", "bibtex": "@inproceedings{\nschwarke2023curiositydriven,\ntitle={Curiosity-Driven Learning of Joint Locomotion and Manipulation Tasks},\nauthor={Clemens Schwarke and Victor Klemm and Matthijs van der Boon and Marko Bjelonic and Marco Hutter},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=QG_ERxtDAP-}\n}", "github": "", "project": "", "reviewers": "81RD;a9Bt;qvEX;bDmA", "site": "https://openreview.net/forum?id=QG_ERxtDAP-", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;4;4;4", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 15, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13129202036853560266&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "QNPuJZyhFE", "title": "Imitating Task and Motion Planning with Visuomotor 
Transformers", "track": "main", "status": "Poster", "tldr": "We propose a novel method for training large scale imitation policies for robotic manipulation by distilling Task and Motion Planning into visuomotor Transformers.", "abstract": "Imitation learning is a powerful tool for training robot manipulation policies, allowing them to learn from expert demonstrations without manual programming or trial-and-error. However, common methods of data collection, such as human supervision, scale poorly, as they are time-consuming and labor-intensive. In contrast, Task and Motion Planning (TAMP) can autonomously generate large-scale datasets of diverse demonstrations. In this work, we show that the combination of large-scale datasets generated by TAMP supervisors and flexible Transformer models to fit them is a powerful paradigm for robot manipulation. We present a novel imitation learning system called OPTIMUS that trains large-scale visuomotor Transformer policies by imitating a TAMP agent. We conduct a thorough study of the design decisions required to imitate TAMP and demonstrate that OPTIMUS can solve a wide variety of challenging vision-based manipulation tasks with over 70 different objects, ranging from long-horizon pick-and-place tasks, to shelf and articulated object manipulation, achieving 70 to 80% success rates. Video results and code at https://mihdalal.github.io/optimus/", "keywords": "Imitation Learning;Task and Motion Planning;Transformers", "primary_area": "", "supplementary_material": "/attachment/8d04713fc5ad85e8850dc5acd66cf1676329718d.zip", "author": "Murtaza Dalal;Ajay Mandlekar;Caelan Reed Garrett;Ankur Handa;Ruslan Salakhutdinov;Dieter Fox", "authorids": "~Murtaza_Dalal1;~Ajay_Mandlekar1;~Caelan_Reed_Garrett1;~Ankur_Handa1;~Ruslan_Salakhutdinov1;~Dieter_Fox1", "gender": "M;M;M;M;M;M", "homepage": "https://mihdalal.github.io/;https://ai.stanford.edu/~amandlek/;http://web.mit.edu/caelan/www/;http://ankurhanda.com;https://homes.cs.washington.edu/~fox/;https://www.cs.cmu.edu/~rsalakhu/", "dblp": "215/5516;https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay;161/9727;32/8653;f/DieterFox;", "google_scholar": "5dBp2f4AAAAJ;MEz23joAAAAJ;KVUCqGwAAAAJ;sCTJI-0AAAAJ;DqXsbPAAAAAJ;", "orcid": ";;0000-0002-6474-1276;;;", "linkedin": "murtaza-dalal-9b397a89/;;caelan-garrett-85197977/;;;", "or_profile": "~Murtaza_Dalal1;~Ajay_Mandlekar1;~Caelan_Reed_Garrett1;~Ankur_Handa1;~Dieter_Fox1;~Russ_Salakhutdinov1", "aff": "Carnegie Mellon University;NVIDIA;NVIDIA;Imperial College London;Department of Computer Science;School of Computer Science, Carnegie Mellon University", "aff_domain": "cmu.edu;nvidia.com;nvidia.com;imperial.ac.uk;cs.washington.edu;cs.cmu.edu", "position": "PhD student;Researcher;Researcher;Research Scientist;Full Professor;Full Professor", "bibtex": "@inproceedings{\ndalal2023imitating,\ntitle={Imitating Task and Motion Planning with Visuomotor Transformers},\nauthor={Murtaza Dalal and Ajay Mandlekar and Caelan Reed Garrett and Ankur Handa and Ruslan Salakhutdinov and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=QNPuJZyhFE}\n}", "github": "", "project": "", "reviewers": "k8yb;5Vfg;iSbk;n8nf", "site": "https://openreview.net/forum?id=QNPuJZyhFE", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;3;4;3", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 56, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12951067735250072051&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;3;0", "aff_unique_norm": "Carnegie Mellon University;NVIDIA;Imperial College London;Unknown Institution", "aff_unique_dep": ";NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://www.cmu.edu;https://www.nvidia.com;https://www.imperial.ac.uk;", "aff_unique_abbr": "CMU;NVIDIA;ICL;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom;" }, { "id": "RN00jfIV-X", "title": "General In-hand Object Rotation with Vision and Touch", "track": "main", "status": "Poster", "tldr": "We present a reinforcement learning policy capable of rotating a diverse set of objects over multiple axes using its fingertips.", "abstract": "We introduce Rotateit, a system that enables fingertip-based object rotation along multiple axes by leveraging multimodal sensory inputs. Our system is trained in simulation, where it has access to ground-truth object shapes and physical properties. Then we distill it to operate on realistic yet noisy simulated visuotactile and proprioceptive sensory inputs. These multimodal inputs are fused via a visuotactile transformer, enabling online inference of object shapes and physical properties during deployment. We show significant performance improvements over prior methods and highlight the importance of visual and tactile sensing.", "keywords": "In-Hand Object Rotation;Tactile Sensing;Reinforcement Learning;Sim2Real;Transformer;Visuotactile Manipulation", "primary_area": "", "supplementary_material": "", "author": "Haozhi Qi;Brent Yi;Sudharshan Suresh;Mike Lambeta;Yi Ma;Roberto Calandra;Jitendra Malik", "authorids": "~Haozhi_Qi1;~Brent_Yi1;~Sudharshan_Suresh1;~Mike_Lambeta1;~Yi_Ma4;~Roberto_Calandra1;~Jitendra_Malik2", "gender": "M;M;M;M;M;M;M", "homepage": "https://haozhi.io/;;http://www.cs.cmu.edu/~sudhars1/;;http://people.eecs.berkeley.edu/~yima/;https://www.robertocalandra.com;https://people.eecs.berkeley.edu/~malik/", "dblp": "190/7802;239/5167;;;;118/8239;58/2944", "google_scholar": "https://scholar.google.com.hk/citations?user=iyVHKkcAAAAJ;https://scholar.google.com/citations?hl=en;xYC738YAAAAJ;;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ;FdE3LOEAAAAJ;oY9R5YQAAAAJ", "orcid": ";;0000-0001-9030-2800;;;0000-0001-9430-8433;0000-0003-3695-1580", "linkedin": ";;;mike-maroje-lambeta;;rcalandra;", "or_profile": "~Haozhi_Qi1;~Brent_Yi1;~Sudharshan_Suresh1;~Mike_Lambeta1;~Yi_Ma4;~Roberto_Calandra1;~Jitendra_Malik2", "aff": "University of California, Berkeley;University of California, Berkeley;Carnegie Mellon University;Meta;University of California, Berkeley;Meta Facebook;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;cmu.edu;meta.com;berkeley.edu;fb.com;berkeley.edu", "position": "PhD student;PhD student;PhD student;Engineer;Full Professor;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nqi2023general,\ntitle={General In-hand Object Rotation with Vision and Touch},\nauthor={Haozhi Qi and Brent Yi and Sudharshan Suresh and Mike Lambeta and Yi Ma and Roberto Calandra and Jitendra Malik},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=RN00jfIV-X}\n}", "github": "", "project": "", "reviewers": "3t8K;ahST;Kkn8;Ms7j", "site": "https://openreview.net/forum?id=RN00jfIV-X", "pdf_size": 0, "rating": 
"4;6;6;10", "confidence": "5;4;3;3", "rating_avg": 6.5, "confidence_avg": 3.75, "replies_avg": 14, "authors#_avg": 7, "corr_rating_confidence": -0.7608859102526822, "gs_citation": 106, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6254107485479000491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;2;0;2;0", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "UC Berkeley;CMU;Meta", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RQ_7yVV8vA", "title": "Learning to See Physical Properties with Active Sensing Motor Policies", "track": "main", "status": "Poster", "tldr": "A legged robot learns to measure terrain properties more accurately by intentionally feeling the ground; its feelings serve as self-supervised labels for a visual perception module; this facilitates locomotion planning in diverse configurations.", "abstract": "To plan efficient robot locomotion, we must use the information about a terrain\u2019s physics that can be inferred from color images. To this end, we train a visual perception module that predicts terrain properties using labels from a small amount of real-world proprioceptive locomotion. To ensure label precision, we introduce Active Sensing Motor Policies (ASMP). These policies are trained to prefer motor skills that facilitate accurately estimating the environment\u2019s physics, like swiping a foot to observe friction. The estimated labels supervise a vision model that infers physical properties directly from color images and can be reused for different tasks. Leveraging a pretrained vision backbone, we demonstrate robust generalization in image space, enabling path planning from overhead imagery despite using only ground camera images for training.", "keywords": "Locomotion;Vision;Navigation;Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Gabriel B. Margolis;Xiang Fu;Yandong Ji;Pulkit Agrawal", "authorids": "~Gabriel_B._Margolis1;~Xiang_Fu4;~Yandong_Ji1;~Pulkit_Agrawal1", "gender": "M;;M;M", "homepage": "https://xiangfu.co/;https://yandongji.github.io;https://people.eecs.berkeley.edu/~pulkitag/;https://gmargo11.github.io/", "dblp": "97/374-5.html;271/8584;149/2672;305/0205", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;LyHzJOMAAAAJ;UpZmJI0AAAAJ;Jzt5uNAAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xiang_Fu4;~Yandong_Ji1;~Pulkit_Agrawal1;~Gabriel_B_Margolis1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Researcher;Assistant Professor;PhD Student", "bibtex": "@inproceedings{\nmargolis2023learning,\ntitle={Learning to See Physical Properties with Active Sensing Motor Policies},\nauthor={Gabriel B. 
Margolis and Xiang Fu and Yandong Ji and Pulkit Agrawal},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=RQ_7yVV8vA}\n}", "github": "", "project": "", "reviewers": "knX2;zcuB;dDBh;dYv6", "site": "https://openreview.net/forum?id=RQ_7yVV8vA", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;5", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15348641928706544303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "RaNAaxZfKi8", "title": "One-shot Imitation Learning via Interaction Warping", "track": "main", "status": "Poster", "tldr": "We use shape warping for one-shot learning of SE(3) robotic manipulation policies.", "abstract": "Learning robot policies from few demonstrations is crucial in open-ended applications. We propose a new method, Interaction Warping, for one-shot learning SE(3) robotic manipulation policies. We infer the 3D mesh of each object in the environment using shape warping, a technique for aligning point clouds across object instances. Then, we represent manipulation actions as keypoints on objects, which can be warped with the shape of the object. We show successful one-shot imitation learning on three simulated and real-world object re-arrangement tasks. We also demonstrate the ability of our method to predict object meshes and robot grasps in the wild. Webpage: https://shapewarping.github.io.", "keywords": "3D manipulation;imitation learning;shape warping", "primary_area": "", "supplementary_material": "/attachment/5f4eb8dae714638859cb0c0d0236fe98eb3aa6cb.zip", "author": "Ondrej Biza;Skye Thompson;Kishore Reddy Pagidi;Abhinav Kumar;Elise van der Pol;Robin Walters;Thomas Kipf;Jan-Willem van de Meent;Lawson L.S. 
Wong;Robert Platt", "authorids": "~Ondrej_Biza1;~Skye_Thompson1;~Kishore_Reddy_Pagidi1;~Abhinav_Kumar7;~Elise_van_der_Pol1;~Robin_Walters1;~Thomas_Kipf2;~Jan-Willem_van_de_Meent1;~Lawson_L.S._Wong2;~Robert_Platt1", "gender": "M;M;M;M;F;M;M;;M;M", "homepage": "https://sites.google.com/view/obiza;http://unknown.org;;;http://elisevanderpol.nl;http://www.robinwalters.com;https://jwvdm.github.io/;http://www.ccs.neu.edu/home/rplatt/;https://www.ccs.neu.edu/home/lsw/;http://tkipf.github.io/", "dblp": "230/8616.html;;;;186/8470.html;258/3416;137/3263;39/5434;35/2573;186/8206", "google_scholar": "Gi9Xq8YAAAAJ;;;;https://scholar.google.nl/citations?user=564o-vIAAAAJ;fnprJmUAAAAJ;CX9Lu38AAAAJ;Z4Y5S2oAAAAJ;https://scholar.google.com/citations?hl=en;83HL5FwAAAAJ", "orcid": "0000-0003-3390-8050;;;;;;0000-0001-9465-5398;;;", "linkedin": "ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;;kishore005/;abhinav-kumar2000;;;;;;thomas-kipf-6b260410a", "or_profile": "~Ondrej_Biza1;~Skye_Thompson1;~Kishore_Reddy_Pagidi1;~Abhinav_Kumar7;~Elise_van_der_Pol1;~Robin_Walters1;~Jan-Willem_van_de_Meent1;~Robert_Platt1;~Lawson_L._S._Wong1;~Thomas_N._Kipf1", "aff": "Northeastern University;Brown University;Northeastern University;Northeastern university ;Microsoft Research;Northeastern University ;Northeastern University;Northeastern University;Northeastern University;Google", "aff_domain": "northeastern.edu;brown.edu;neu.edu;northeastern.edu;microsoft.com;northeastern.edu;northeastern.edu;neu.edu;northeastern.edu;google.com", "position": "PhD student;PhD student;MS student;MS student;Researcher;Assistant Professor;Assistant Professor;Associate Professor;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nbiza2023oneshot,\ntitle={One-shot Imitation Learning via Interaction Warping},\nauthor={Ondrej Biza and Skye Thompson and Kishore Reddy Pagidi and Abhinav Kumar and Elise van der Pol and Robin Walters and Thomas Kipf and Jan-Willem van de Meent and Lawson L.S. Wong and Robert Platt},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=RaNAaxZfKi8}\n}", "github": "https://github.com/ondrejbiza/shapewarping", "project": "", "reviewers": "M4iE;odM7;XXj1;mU8N", "site": "https://openreview.net/forum?id=RaNAaxZfKi8", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "5;5;3;3", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 10, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10086054051941946387&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;0;0;2;0;0;0;0;3", "aff_unique_norm": "Northeastern University;Brown University;Microsoft;Google", "aff_unique_dep": ";;Microsoft Research;Google", "aff_unique_url": "https://www.northeastern.edu;https://www.brown.edu;https://www.microsoft.com/en-us/research;https://www.google.com", "aff_unique_abbr": "NEU;Brown;MSR;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Rb0nGIt_kh5", "title": "Distilled Feature Fields Enable Few-Shot Language-Guided Manipulation", "track": "main", "status": "Oral", "tldr": "Distilled Feature Fields Enable Few-Shot Language-Guided Manipulation", "abstract": "Self-supervised and language-supervised image models contain rich knowledge of the world that is important for generalization. 
Many robotic tasks, however, require a detailed understanding of 3D geometry, which is often lacking in 2D image features. This work bridges this 2D-to-3D gap for robotic manipulation by leveraging distilled feature fields to combine accurate 3D geometry with rich semantics from 2D foundation models. We present a few-shot learning method for 6-DOF grasping and placing that harnesses these strong spatial and semantic priors to achieve in-the-wild generalization to unseen objects. Using features distilled from a vision-language model, CLIP, we present a way to designate novel objects for manipulation via free-text natural language, and demonstrate its ability to generalize to unseen expressions and novel categories of objects. Project website: https://f3rm.csail.mit.edu", "keywords": "Neural Fields;Foundation Models;Scene Understanding;Robot Manipulation", "primary_area": "", "supplementary_material": "/attachment/7bf09cdb360664ee7270c0abd510d7ac7f85959a.zip", "author": "William Shen;Ge Yang;Alan Yu;Jansen Wong;Leslie Pack Kaelbling;Phillip Isola", "authorids": "~William_Shen1;~Ge_Yang1;~Alan_Yu2;~Jansen_Wong1;~Leslie_Pack_Kaelbling1;~Phillip_Isola1", "gender": "M;M;M;M;F;M", "homepage": ";http://www.episodeyang.com;https://alany1.github.io;;http://people.csail.mit.edu/lpk/;http://web.mit.edu/phillipi/", "dblp": ";48/4561-3;;;k/LesliePackKaelbling;36/9988", "google_scholar": "UREPJYIAAAAJ;vaQcF6kAAAAJ;https://scholar.google.com/citations?hl=en;;IcasIiwAAAAJ;ROILf3EAAAAJ", "orcid": ";0000-0001-7520-7055;;;0000-0001-6054-7145;0000-0002-1411-6704", "linkedin": "williamshen-nz/;;;jansenwong/;;phillip-isola-a9955b20/", "or_profile": "~William_Shen1;~Ge_Yang1;~Alan_Yu2;~Jansen_Wong1;~Leslie_Pack_Kaelbling1;~Phillip_Isola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;Undergrad student;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nshen2023distilled,\ntitle={Distilled Feature Fields Enable Few-Shot Language-Guided Manipulation},\nauthor={William Shen and Ge Yang and Alan Yu and Jansen Wong and Leslie Pack Kaelbling and Phillip Isola},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Rb0nGIt_kh5}\n}", "github": "https://github.com/f3rm/f3rm", "project": "", "reviewers": "j1dX;JsoY;7BHn;azcv", "site": "https://openreview.net/forum?id=Rb0nGIt_kh5", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "3;4;4;5", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.7071067811865475, "gs_citation": 113, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2243532271563662553&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "RcZMI8MSyE", "title": "Large Language Models as General Pattern Machines", "track": "main", "status": "Poster", "tldr": "We explore the utility of large language models to function as \"general pattern machines\" 
for robotics and sequential decision making, enabled by their capability to perform in-context learning over abstract sequences.", "abstract": "We observe that pre-trained large language models (LLMs) are capable of autoregressively completing complex token sequences\u2014from arbitrary ones procedurally generated by probabilistic context-free grammars (PCFG), to more rich spatial patterns found in the Abstraction and Reasoning Corpus (ARC), a general AI benchmark, prompted in the style of ASCII art. Surprisingly, pattern completion proficiency can be partially retained even when the sequences are expressed using tokens randomly sampled from the vocabulary. These results suggest that without any additional training, LLMs can serve as general sequence modelers, driven by in-context learning. In this work, we investigate how these zero-shot capabilities may be applied to problems in robotics\u2014from extrapolating sequences of numbers that represent states over time to complete simple motions, to least-to-most prompting of reward-conditioned trajectories that can discover and represent closed-loop policies (e.g., a stabilizing controller for CartPole). While difficult to deploy today for real systems due to latency, context size limitations, and compute costs, the approach of using LLMs to drive low-level control may provide an exciting glimpse into how the patterns among words could be transferred to actions.", "keywords": "large language models;in-context learning;language for robotics", "primary_area": "", "supplementary_material": "/attachment/802cb0f73cc3a8b2857488e7b5cfd3623cfc7733.zip", "author": "Suvir Mirchandani;Fei Xia;Pete Florence;brian ichter;Danny Driess;Montserrat Gonzalez Arenas;Kanishka Rao;Dorsa Sadigh;Andy Zeng", "authorids": "~Suvir_Mirchandani1;~Fei_Xia1;~Pete_Florence1;~brian_ichter1;~Danny_Driess1;~Montserrat_Gonzalez_Arenas1;~Kanishka_Rao1;~Dorsa_Sadigh1;~Andy_Zeng3", "gender": "M;M;;;;F;;F;M", "homepage": "http://suvirpmirchandani.com;;http://www.peteflorence.com/;;https://dannydriess.github.io/;;https://research.google/people/KanishkaRao/;https://dorsa.fyi/;http://andyzeng.github.io/", "dblp": "287/4981;;;;;;;117/3174;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy", "google_scholar": "fz7LJPIAAAAJ;pqP5_PgAAAAJ;;-w5DuHgAAAAJ;https://scholar.google.de/citations?user=wxnzyjwAAAAJ;;;ZaJEZpYAAAAJ;q7nFtUcAAAAJ", "orcid": ";0000-0003-4343-1444;;;;;;;", "linkedin": ";;;;;montse90/;;;", "or_profile": "~Suvir_Mirchandani1;~Fei_Xia1;~Pete_Florence1;~brian_ichter1;~Danny_Driess1;~Montserrat_Gonzalez_Arenas1;~Kanishka_Rao1;~Dorsa_Sadigh1;~Andy_Zeng1", "aff": "Google;Google;Google;Google;Technische Universit\u00e4t Berlin;;;Stanford University;Google", "aff_domain": "google.com;google.com;google.com;google.com;tu-berlin.de;;;stanford.edu;google.com", "position": "Part-time student researcher;Researcher;Research Scientist;Research Scientist;PhD student;;;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nmirchandani2023large,\ntitle={Large Language Models as General Pattern Machines},\nauthor={Suvir Mirchandani and Fei Xia and Pete Florence and brian ichter and Danny Driess and Montserrat Gonzalez Arenas and Kanishka Rao and Dorsa Sadigh and Andy Zeng},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=RcZMI8MSyE}\n}", "github": "", "project": "", "reviewers": "JNxR;A4xe;seqs;fBNs", "site": "https://openreview.net/forum?id=RcZMI8MSyE", "pdf_size": 0, "rating": "4;4;6;10", "confidence": "4;3;3;4", 
"rating_avg": 6.0, "confidence_avg": 3.5, "replies_avg": 17, "authors#_avg": 9, "corr_rating_confidence": 0.40824829046386296, "gs_citation": 220, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9776723022121320548&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1;2;0", "aff_unique_norm": "Google;Technische Universit\u00e4t Berlin;Stanford University", "aff_unique_dep": "Google;;", "aff_unique_url": "https://www.google.com;https://www.tu-berlin.de;https://www.stanford.edu", "aff_unique_abbr": "Google;TU Berlin;Stanford", "aff_campus_unique_index": "0;0;0;0;2;0", "aff_campus_unique": "Mountain View;;Stanford", "aff_country_unique_index": "0;0;0;0;1;0;0", "aff_country_unique": "United States;Germany" }, { "id": "SgTPdyehXMA", "title": "Language to Rewards for Robotic Skill Synthesis", "track": "main", "status": "Oral", "tldr": "We propose to use reward function to bridge language model and low-level robot actions for interactive creation of novel behavior from human instructions.", "abstract": "Large language models (LLMs) have demonstrated exciting progress in acquiring diverse new capabilities through in-context learning, ranging from logical reasoning to code-writing. Robotics researchers have also explored using LLMs to advance the capabilities of robotic control. However, since low-level robot actions are hardware-dependent and underrepresented in LLM training corpora, existing efforts in applying LLMs to robotics have largely treated LLMs as semantic planners or relied on human-engineered control primitives to interface with the robot. On the other hand, reward functions are shown to be flexible representations that can be optimized for control policies to achieve diverse tasks, while their semantic richness makes them suitable to be specified by LLMs.\nIn this work, we introduce a new paradigm that harnesses this realization by utilizing LLMs to define reward parameters that can be optimized and accomplish variety of robotic tasks. Using reward as the intermediate interface generated by LLMs, we can effectively bridge the gap between high-level language instructions or corrections to low-level robot actions. Meanwhile, combining this with a real-time optimizer, MuJoCo MPC, empowers an interactive behavior creation experience where users can immediately observe the results and provide feedback to the system.\nTo systematically evaluate the performance of our proposed method, we designed a total of 17 tasks for a simulated quadruped robot and a dexterous manipulator robot. 
We demonstrate that our proposed method reliably tackles 90% of the designed tasks, while a baseline using primitive skills as the interface with Code-as-policies achieves 50% of the tasks.\nWe further validated our method on a real robot arm where complex manipulation skills such as non-prehensile pushing emerge through our interactive system.", "keywords": "Large language model (LLM);Low-level skill learning;Legged locomotion;Dexterous manipulation", "primary_area": "", "supplementary_material": "/attachment/6bf4ceda018ff83588246b7e2d86ea828c124da2.zip", "author": "Wenhao Yu;Nimrod Gileadi;Chuyuan Fu;Sean Kirmani;Kuang-Huei Lee;Montserrat Gonzalez Arenas;Hao-Tien Lewis Chiang;Tom Erez;Leonard Hasenclever;Jan Humplik;brian ichter;Ted Xiao;Peng Xu;Andy Zeng;Tingnan Zhang;Nicolas Heess;Dorsa Sadigh;Jie Tan;Yuval Tassa;Fei Xia", "authorids": "~Wenhao_Yu1;~Nimrod_Gileadi1;~Chuyuan_Fu1;~Sean_Kirmani1;~Kuang-Huei_Lee1;~Montserrat_Gonzalez_Arenas1;~Hao-Tien_Lewis_Chiang1;~Tom_Erez1;~Leonard_Hasenclever1;~Jan_Humplik1;~brian_ichter1;~Ted_Xiao1;~Peng_Xu9;~Andy_Zeng3;~Tingnan_Zhang1;~Nicolas_Heess1;~Dorsa_Sadigh1;~Jie_Tan1;~Yuval_Tassa2;~Fei_Xia1", "gender": "M;M;F;M;M;F;M;M;M;M;;M;M;M;;F;M;M;M;M", "homepage": "https://wenhaoyu.weebly.com/;;;https://kirmani.io/;https://kuanghuei.github.io/;;https://sites.google.com/view/lewispro/home;;;;;https://www.tedxiao.me;;;;https://dorsa.fyi/;http://www.jie-tan.net;;http://andyzeng.github.io/;", "dblp": ";;;;66/11466;;;http://dblp.uni-trier.de/pers/hd/e/Erez:Tom;150/1667;215/9213;;198/0598;;https://dblp.uni-trier.de/pers/hd/z/Zhang:Tingnan;76/9181;117/3174;81/7419;;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy;20/4415", "google_scholar": "1bF2s2kAAAAJ;snHVatUAAAAJ;bDq7MZMAAAAJ;iyEuK8kAAAAJ;rE7-N30AAAAJ;;megAxigAAAAJ;https://scholar.google.co.uk/citations?user=gVFnjOcAAAAJ;https://scholar.google.co.uk/citations?user=dD-3S4QAAAAJ;YE9w2BsAAAAJ;-w5DuHgAAAAJ;;460NWeQAAAAJ;RM2vMNcAAAAJ;79k7bGEAAAAJ;ZaJEZpYAAAAJ;neGbgzYAAAAJ;pqP5_PgAAAAJ;q7nFtUcAAAAJ;https://scholar.google.co.uk/citations?user=CjOTm_4AAAAJ", "orcid": ";;;;;;;;;;;;;;;;;0000-0003-4343-1444;;", "linkedin": ";nimrod-gileadi-6669b422;;skirmani;;montse90/;hao-tien-lewis-chiang-22598a79/;ereztom;;;;;;;;;jie-tan/;;;", "or_profile": "~Wenhao_Yu1;~Nimrod_Gileadi1;~Chuyuan_Fu1;~Sean_Kirmani1;~Kuang-Huei_Lee1;~Montserrat_Gonzalez_Arenas1;~Hao-Tien_Lewis_Chiang1;~Tom_Erez1;~Leonard_Hasenclever1;~Jan_Humplik1;~brian_ichter1;~Ted_Xiao1;~Peng_Xu9;~Tingnan_Zhang1;~Nicolas_Heess1;~Dorsa_Sadigh1;~Jie_Tan1;~Fei_Xia1;~Andy_Zeng1;~yuval_tassa1", "aff": "Google;Google DeepMind;Google;Google X;Google;;Google Deepmind;;Google DeepMind;Google DeepMind;Google;;Google;Google;Google DeepMind;Stanford University;Google;Google;Google;Google", "aff_domain": "google.com;deepmind.com;google.com;x.team;google.com;;deepmind.com;;google.com;google.com;google.com;;google.com;google.com;google.com;stanford.edu;google.com;google.com;google.com;google.com", "position": "Software Engineer;Software Engineer;software engineer;Researcher;Researcher;;Researcher;;Research Scientist;Research scientist;Research Scientist;;Researcher;Software Engineer;Research Scientist;Assistant Professor;Research Scientist;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nyu2023language,\ntitle={Language to Rewards for Robotic Skill Synthesis},\nauthor={Wenhao Yu and Nimrod Gileadi and Chuyuan Fu and Sean Kirmani and Kuang-Huei Lee and Montserrat Gonzalez Arenas and Hao-Tien Lewis Chiang and Tom Erez and Leonard Hasenclever and Jan 
Humplik and brian ichter and Ted Xiao and Peng Xu and Andy Zeng and Tingnan Zhang and Nicolas Heess and Dorsa Sadigh and Jie Tan and Yuval Tassa and Fei Xia},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=SgTPdyehXMA}\n}", "github": "", "project": "", "reviewers": "ba4p;NCUb;8cwu", "site": "https://openreview.net/forum?id=SgTPdyehXMA", "pdf_size": 0, "rating": "4;6;6", "confidence": "3;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 20, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 326, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18126079343828260538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;1;0;0;0;0;0;0;2;0;0;0;0", "aff_unique_norm": "Google;DeepMind;Stanford University", "aff_unique_dep": "Google;DeepMind;", "aff_unique_url": "https://www.google.com;https://deepmind.com;https://www.stanford.edu", "aff_unique_abbr": "Google;DeepMind;Stanford", "aff_campus_unique_index": "0;0;0;0;0;0;2;0;0;0;0", "aff_campus_unique": "Mountain View;;Stanford", "aff_country_unique_index": "0;1;0;0;0;1;1;1;0;0;0;1;0;0;0;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "TWgoGdubPN", "title": "Enabling Efficient, Reliable Real-World Reinforcement Learning with Approximate Physics-Based Models", "track": "main", "status": "Poster", "tldr": "We use an approximate physics-based model to design better policy gradient estimators and policy architectures to enable efficient real-world robot learning", "abstract": "We focus on developing efficient and reliable policy optimization strategies for robot learning with real-world data. \nIn recent years, policy gradient methods have emerged as a promising paradigm for training control policies in simulation. \nHowever, these approaches often remain too data inefficient or unreliable to train on real robotic hardware. In this paper we introduce a novel policy gradient-based policy optimization framework which systematically leverages a (possibly highly simplified) first-principles model and enables learning precise control policies with limited amounts of real-world data. Our approach $1)$ uses the derivatives of the model to produce sample-efficient estimates of the policy gradient and $2)$ uses the model to design a low-level tracking controller, which is embedded in the policy class. 
Theoretical analysis provides insight into how the presence of this feedback controller overcomes key limitations of stand-alone policy gradient methods, while hardware experiments with a small car and quadruped demonstrate that our approach can learn precise control strategies reliably and with only minutes of real-world data.", "keywords": "Model-based Reinforcement Learning;Feedback Control;Quadrupedal Locomotion", "primary_area": "", "supplementary_material": "/attachment/35feb0b7d27ed2bd3561ab51fe61d52447194efa.zip", "author": "Tyler Westenbroek;Jacob Levy;David Fridovich-Keil", "authorids": "~Tyler_Westenbroek1;~Jacob_Levy1;~David_Fridovich-Keil1", "gender": "M;M;M", "homepage": "https://scholar.google.com/citations?user=aqSKwDQAAAAJ&hl=en;;https://dfridovi.github.io", "dblp": ";;203/5260", "google_scholar": ";LLmcf-oAAAAJ;gqyTnpQAAAAJ", "orcid": ";;", "linkedin": ";jacob-levy-13b85069/;", "or_profile": "~Tyler_Westenbroek1;~Jacob_Levy1;~David_Fridovich-Keil1", "aff": ";University of Texas at Austin;University of Texas at Austin", "aff_domain": ";utexas.edu;utexas.edu", "position": ";PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwestenbroek2023enabling,\ntitle={Enabling Efficient, Reliable Real-World Reinforcement Learning with Approximate Physics-Based Models},\nauthor={Tyler Westenbroek and Jacob Levy and David Fridovich-Keil},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=TWgoGdubPN}\n}", "github": "https://github.com/CLeARoboticsLab/LearningWithSimpleModels.jl", "project": "", "reviewers": "D3r9;9dCJ;wfR7;wqLE", "site": "https://openreview.net/forum?id=TWgoGdubPN", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "3;3;3;4", "rating_avg": 8.0, "confidence_avg": 3.25, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1633826177406692869&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "TgJ8vJUVUBR", "title": "TraCo: Learning Virtual Traffic Coordinator for Cooperation with Multi-Agent Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "This is an article that is applied in the field of autonomous driving and requires the agent to complete the corresponding contribution from the perspective of the team.", "abstract": "Multi-agent reinforcement learning (MARL) has emerged as a popular technique in diverse domains due to its ability to automate system controller design and facilitate continuous intelligence learning. For instance, traffic flow is often trained with MARL to enable intelligent simulations for autonomous driving. However, existing MARL algorithms only characterize the relative degree of each agent's contribution to the team, and cannot express the contribution that the team needs from the agent. Especially in the field of autonomous driving, the team changes over time, and the agent needs to act directly according to the needs of the team. To address these limitations, we propose an innovative method inspired by realistic traffic coordinators called the Traffic Coordinator Network (TraCo). 
Our approach leverages a combination of cross-attention and counterfactual advantage function, allowing us to extract distinctive characteristics of domain agents and accurately quantify the contribution that a team needs from an agent. Through experiments conducted on four traffic tasks, we demonstrate that our method outperforms existing approaches, yielding superior performance. Furthermore, our approach enables the emergence of rich and diverse social behaviors among vehicles within the traffic flow.", "keywords": "autonomous driving;multi-agent reinforcement learning;counterfactual reasoning", "primary_area": "", "supplementary_material": "/attachment/0fab89f7600dc5581ce58e803e96dc939f1d0a6e.zip", "author": "Weiwei Liu;Wei Jing;lingping Gao;Ke Guo;Gang Xu;Yong Liu", "authorids": "~Weiwei_Liu3;~Wei_Jing1;~lingping_Gao1;~Ke_Guo1;wuuya@zju.edu.cn;~Yong_Liu11", "gender": ";;;M;;M", "homepage": "https://april.zju.edu.cn/team/weiwei-liu/;http://weijing.github.io/;https://ieeexplore.ieee.org/author/37089196389;;;https://person.zju.edu.cn/en/yongliu", "dblp": ";;;;;29/4867-7", "google_scholar": ";https://scholar.google.com/citations?hl=en;;1oysro0AAAAJ;;https://scholar.google.com.hk/citations?user=qYcgBbEAAAAJ", "orcid": ";;;;;0000-0003-4822-8939", "linkedin": ";;;;;", "or_profile": "~Weiwei_Liu3;~Wei_Jing1;~lingping_Gao1;~Ke_Guo1;wuuya@zju.edu.cn;~Yong_Liu11", "aff": "Zhejiang University;Alibaba Group;;University of Hong Kong;;Zhejiang University", "aff_domain": "zju.edu.cn;alibaba-inc.com;;hku.hk;;zju.edu.cn", "position": "PhD student;Researcher;;PhD student;;Full Professor", "bibtex": "@inproceedings{\nliu2023traco,\ntitle={TraCo: Learning Virtual Traffic Coordinator for Cooperation with Multi-Agent Reinforcement Learning},\nauthor={Weiwei Liu and Wei Jing and lingping Gao and Ke Guo and Gang Xu and Yong Liu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=TgJ8vJUVUBR}\n}", "github": "", "project": "", "reviewers": "9NNb;TCct;nkFo;qaUa", "site": "https://openreview.net/forum?id=TgJ8vJUVUBR", "pdf_size": 0, "rating": "4;4;6;10", "confidence": "4;3;3;5", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.7385489458759963, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16386536336355677555&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Zhejiang University;Alibaba Group;University of Hong Kong", "aff_unique_dep": ";;", "aff_unique_url": "https://www.zju.edu.cn;https://www.alibaba.com;https://www.hku.hk", "aff_unique_abbr": "ZJU;Alibaba;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "Tka2U40pHz0", "title": "Tuning Legged Locomotion Controllers via Safe Bayesian Optimization", "track": "main", "status": "Poster", "tldr": "A safe learning approach to tune legged locomotion controllers via safe bayesian optimization to address the mismatch between the simplified model used in the control formulation and the real system.", "abstract": "This paper presents a data-driven strategy to streamline the deployment of model-based controllers in legged robotic hardware platforms. Our approach leverages a model-free safe learning algorithm to automate the tuning of control gains, addressing the mismatch between the simplified model used in the control formulation and the real system. 
This method substantially mitigates the risk of hazardous interactions with the robot by sample-efficiently optimizing parameters within a probably safe region. Additionally, we extend the applicability of our approach to incorporate the different gait parameters as contexts, leading to a safe, sample-efficient exploration algorithm capable of tuning a motion controller for diverse gait patterns. We validate our method through simulation and hardware experiments, where we demonstrate that the algorithm obtains superior performance on tuning a model-based motion controller for multiple gaits safely.", "keywords": "Legged Robotics;Bayesian Optimization;Controller Tuning;Locomotion;Machine Learning;Safe Learning", "primary_area": "", "supplementary_material": "/attachment/8e56d8b3e3f1abcc52f13033170a71a9dc9f6f30.zip", "author": "Daniel Widmer;Dongho Kang;Bhavya Sukhija;Jonas H\u00fcbotter;Andreas Krause;Stelian Coros", "authorids": "~Daniel_Widmer1;~Dongho_Kang1;~Bhavya_Sukhija1;~Jonas_H\u00fcbotter1;~Andreas_Krause1;~Stelian_Coros1", "gender": ";M;M;M;M;M", "homepage": ";https://donghok.me/;;https://jonhue.github.io;https://las.inf.ethz.ch/krausea;http://crl.ethz.ch/index.html", "dblp": ";;312/4742;300/4583;87/1831-1.html;", "google_scholar": ";east0822;;pxi_RkwAAAAJ;https://scholar.google.ch/citations?user=eDHv58AAAAAJ;sX31JjwAAAAJ", "orcid": "0009-0004-5820-2964;;0000-0001-6238-9734;;0000-0001-7260-9673;", "linkedin": ";kangdongho/;;jonhue/;krausea/;", "or_profile": "~Daniel_Widmer1;~Dongho_Kang1;~Bhavya_Sukhija1;~Jonas_H\u00fcbotter1;~Andreas_Krause1;~Stelian_Coros1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETH Zurich;ETH Zurich;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "MS student;PhD student;PhD student;MS student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nwidmer2023tuning,\ntitle={Tuning Legged Locomotion Controllers via Safe Bayesian Optimization},\nauthor={Daniel Widmer and Dongho Kang and Bhavya Sukhija and Jonas H{\\\"u}botter and Andreas Krause and Stelian Coros},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=Tka2U40pHz0}\n}", "github": "https://github.com/lasgroup/gosafeopt", "project": "", "reviewers": "2pRp;2qzt;bsr3;9Nt1", "site": "https://openreview.net/forum?id=Tka2U40pHz0", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;3;4;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 13, "authors#_avg": 6, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3456725330116212677&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "id": "UVARkqnsDd", "title": "ScalableMap: Scalable Map Learning for Online Long-Range Vectorized HD Map Construction", "track": "main", "status": "Poster", "tldr": "We exploit the structural properties of vectorized map elements to address the challenges of accurately detecting map elements at longer ranges.", "abstract": "We propose a novel end-to-end pipeline for online long-range vectorized high-definition (HD) map construction using on-board camera sensors. 
The vectorized representation of HD maps, employing polylines and polygons to represent map elements, is widely used by downstream tasks. However, previous schemes designed with reference to dynamic object detection overlook the structural constraints within linear map elements, resulting in performance degradation in long-range scenarios. In this paper, we exploit the properties of map elements to improve the performance of map construction. We extract more accurate bird\u2019s eye view (BEV) features guided by their linear structure, and then propose a hierarchical sparse map representation to further leverage the scalability of vectorized map elements, and design a progressive decoding mechanism and a supervision strategy based on this representation. Our approach, ScalableMap, demonstrates superior performance on the nuScenes dataset, especially in long-range scenarios, surpassing the previous state-of-the-art model by 6.5 mAP while achieving 18.3 FPS.", "keywords": "Map Construction;Multi-view Perception;Long-range Perception", "primary_area": "", "supplementary_material": "/attachment/f09da037e1d4edc8d1cf385b0dbeb998153c86a0.zip", "author": "Jingyi Yu;Zizhao Zhang;Shengfu Xia;Jizhang Sang", "authorids": "~Jingyi_Yu6;~Zizhao_Zhang7;~Shengfu_Xia1;~Jizhang_Sang1", "gender": "F;;M;M", "homepage": ";https://github.com/shfxia;http://main.sgg.whu.edu.cn;https://3zair.github.io/", "dblp": ";;;", "google_scholar": ";;;", "orcid": ";;;", "linkedin": "jingyiyu-whu/;;;", "or_profile": "~Jingyi_Yu6;~Shengfu_Xia1;~Jizhang_Sang1;~\u5b50\u948a_\u5f201", "aff": "Wuhan University;Wuhan University;Wuhan University;Wuhan University", "aff_domain": "whu.edu.cn;whu.edu.cn;whu.edu.cn;whu.edu.cn", "position": "MS student;PhD student;Full Professor;MS student", "bibtex": "@inproceedings{\nyu2023scalablemap,\ntitle={ScalableMap: Scalable Map Learning for Online Long-Range Vectorized {HD} Map Construction},\nauthor={Jingyi Yu and Zizhao Zhang and Shengfu Xia and Jizhang Sang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=UVARkqnsDd}\n}", "github": "https://github.com/jingy1yu/ScalableMap", "project": "", "reviewers": "Z2cB;8yNs;3QMy;f6RE", "site": "https://openreview.net/forum?id=UVARkqnsDd", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;5;4;4", "rating_avg": 5.5, "confidence_avg": 4.25, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2792052997126731078&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Wuhan University", "aff_unique_dep": "", "aff_unique_url": "http://www.whu.edu.cn/", "aff_unique_abbr": "WHU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "UZpWSDA3tZJ", "title": "Towards General Single-Utensil Food Acquisition with Human-Informed Actions", "track": "main", "status": "Poster", "tldr": "Distilling a structured action space a priori from human demonstrations can lead to faster online learning as demonstrated with robotic food manipulation.", "abstract": "Food acquisition with common general-purpose utensils is a necessary component of robot applications like in-home assistive feeding. Learning acquisition policies in this space is difficult in part because any model will need to contend with extensive state and action spaces. 
Food is extremely diverse and generally difficult to simulate, and acquisition actions like skewers, scoops, wiggles, and twirls can be parameterized in myriad ways. However, food's visual diversity can belie a degree of physical homogeneity, and many foods allow flexibility in how they are acquired. Due to these facts, our key insight is that a small subset of actions is sufficient to acquire a wide variety of food items. In this work, we present a methodology for identifying such a subset from limited human trajectory data. We first develop an over-parameterized action space of robot acquisition trajectories that capture the variety of human food acquisition technique. By mapping human trajectories into this space and clustering, we construct a discrete set of 11 actions. We demonstrate that this set is capable of acquiring a variety of food items with $\\geq80\\%$ success rate, a rate that users have said is sufficient for in-home robot-assisted feeding. Furthermore, since this set is so small, we also show that we can use online learning to determine a sufficiently optimal action for a previously-unseen food item over the course of a single meal.", "keywords": "Manipulation;Learning from Demonstration;Assistive Robotics", "primary_area": "", "supplementary_material": "/attachment/c331fba3f8de7fd62be3cb3eb64fbe22990c8247.zip", "author": "Ethan Kroll Gordon;Amal Nanavati;Ramya Challa;Bernie Hao Zhu;Taylor Annette Kessler Faulkner;Siddhartha Srinivasa", "authorids": "~Ethan_Kroll_Gordon1;~Amal_Nanavati1;~Ramya_Challa1;~Bernie_Hao_Zhu1;~Taylor_Annette_Kessler_Faulkner1;~Siddhartha_Srinivasa1", "gender": "Not Specified;;F;;;M", "homepage": "https://ethankgordon.com;;;;;https://goodrobot.ai", "dblp": "242/8139;;;;;", "google_scholar": "hfmi3QwAAAAJ;68V-nNAAAAAJ;;;;https://scholar.google.com.tw/citations?user=RCi98EAAAAAJ", "orcid": "0000-0003-1621-2342;0000-0001-5380-7834;0000-0002-6262-5132;;;", "linkedin": "https://linkedin.com/in/ekgordon;amalnanavati;;;;", "or_profile": "~Ethan_Kroll_Gordon1;~Amal_Nanavati1;~Ramya_Challa1;~Bernie_Hao_Zhu1;~Taylor_Annette_Kessler_Faulkner1;~Siddhartha_Srinivasa1", "aff": "Department of Computer Science, University of Washington;Department of Computer Science;Oregon State University;;;University of Washington", "aff_domain": "cs.washington.edu;cs.washington.edu;oregonstate.edu;;;washington.edu", "position": "PhD student;PhD student;PhD student;;;Full Professor", "bibtex": "@inproceedings{\ngordon2023towards,\ntitle={Towards General Single-Utensil Food Acquisition with Human-Informed Actions},\nauthor={Ethan Kroll Gordon and Amal Nanavati and Ramya Challa and Bernie Hao Zhu and Taylor Annette Kessler Faulkner and Siddhartha Srinivasa},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=UZpWSDA3tZJ}\n}", "github": "https://github.com/personalrobotics/corl23_towards_general_food_acquisition (implementation: https://github.com/personalrobotics/ada_feeding)", "project": "", "reviewers": "W1Qt;tu76;tE7u;kQNo", "site": "https://openreview.net/forum?id=UZpWSDA3tZJ", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;2;4", "rating_avg": 6.0, "confidence_avg": 3.5, "replies_avg": 15, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4132725931113214081&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "University of Washington;Unknown Institution;Oregon State University", 
"aff_unique_dep": "Department of Computer Science;Department of Computer Science;", "aff_unique_url": "https://www.washington.edu;;https://oregonstate.edu", "aff_unique_abbr": "UW;;OSU", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "VH6WIPF4Sj", "title": "Predicting Object Interactions with Behavior Primitives: An Application in Stowing Tasks", "track": "main", "status": "Oral", "tldr": "", "abstract": "Stowing, the task of placing objects in cluttered shelves or bins, is a common task in warehouse and manufacturing operations. However, this task is still predominantly carried out by human workers as stowing is challenging to automate due to the complex multi-object interactions and long-horizon nature of the task. Previous works typically involve extensive data collection and costly human labeling of semantic priors across diverse object categories. This paper presents a method to learn a generalizable robot stowing policy from predictive model of object interactions and a single demonstration with behavior primitives. We propose a novel framework that utilizes Graph Neural Networks (GNNs) to predict object interactions within the parameter space of behavioral primitives. We further employ primitive-augmented trajectory optimization to search the parameters of a predefined library of heterogeneous behavioral primitives to instantiate the control action. Our framework enables robots to proficiently execute long-horizon stowing tasks with a few keyframes (3-4) from a single demonstration. Despite being solely trained in a simulation, our framework demonstrates remarkable generalization capabilities. It efficiently adapts to a broad spectrum of real-world conditions, including various shelf widths, fluctuating quantities of objects, and objects with diverse attributes such as sizes and shapes.", "keywords": "Robotic Manipulation;Model Learning;Graph-Based Neural Dynamics;Multi-Object Interactions", "primary_area": "", "supplementary_material": "/attachment/060bad864d2b4d0561745ac261a2c5c3b7a1f607.zip", "author": "Haonan Chen;Yilong Niu;Kaiwen Hong;Shuijing Liu;Yixuan Wang;Yunzhu Li;Katherine Rose Driggs-Campbell", "authorids": "~Haonan_Chen6;~Yilong_Niu1;~Kaiwen_Hong1;~Shuijing_Liu1;~Yixuan_Wang2;~Yunzhu_Li1;~Katherine_Rose_Driggs-Campbell1", "gender": ";;M;F;M;M;", "homepage": "https://ece.illinois.edu/about/directory/grad-students/haonan2;https://github.com/YilongNiu;;https://shuijing725.github.io;https://wangyixuan12.github.io/;https://yunzhuli.github.io/;", "dblp": ";;;211/7210;44/4317-3;182/1831;", "google_scholar": ";;;I4k7ukgAAAAJ;https://scholar.google.com/citations?hl=en;WlA92lcAAAAJ;", "orcid": ";;;;0009-0006-6641-4718;;", "linkedin": ";;kaiwen-hong-524520141/;shuijing-liu-4089b3123;yixuan-wang-54298115a;;", "or_profile": "~Haonan_Chen6;~Yilong_Niu1;~Kaiwen_Hong1;~Shuijing_Liu1;~Yixuan_Wang2;~Yunzhu_Li1;~Katherine_Rose_Driggs-Campbell1", "aff": "University of Illinois Urbana-Champaign;University of Illinois Urbana-Champaign;UIUC;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;Stanford University;", "aff_domain": "illinois.edu;illinois.edu;illinois.edu;uiuc.edu;illinois.edu;stanford.edu;", "position": "PhD student;Undergrad student;PhD student;PhD student;MS student;Postdoc;", "bibtex": "@inproceedings{\nchen2023predicting,\ntitle={Predicting Object Interactions with Behavior Primitives: An Application in Stowing Tasks},\nauthor={Haonan Chen 
and Yilong Niu and Kaiwen Hong and Shuijing Liu and Yixuan Wang and Yunzhu Li and Katherine Rose Driggs-Campbell},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=VH6WIPF4Sj}\n}", "github": "", "project": "", "reviewers": "pqt2;kz8m;qLwL;NnNZ", "site": "https://openreview.net/forum?id=VH6WIPF4Sj", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;5;4;4", "rating_avg": 7.0, "confidence_avg": 4.25, "replies_avg": 17, "authors#_avg": 7, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15569451361805815589&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://illinois.edu;https://www.stanford.edu", "aff_unique_abbr": "UIUC;Stanford", "aff_campus_unique_index": "0;0;0;0;0;1", "aff_campus_unique": "Urbana-Champaign;Stanford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "VLihM67Wdi6", "title": "STERLING: Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience", "track": "main", "status": "Poster", "tldr": "We propose a novel self-supervised terrain representation learning algorithm that can learn relevant representations from unconstrained, unlabelled robot experience. We evaluate it against competitve baselines on a real robot in outdoor environments.", "abstract": "Terrain awareness, i.e., the ability to identify and distinguish different types of terrain, is a critical ability that robots must have to succeed at autonomous off-road navigation. Current approaches that provide robots with this awareness either rely on labeled data which is expensive to collect, engineered features and cost functions that may not generalize, or expert human demonstrations which may not be available. Towards endowing robots with terrain awareness without these limitations, we introduce Self-supervised TErrain Representation LearnING (STERLING), a novel approach for learning terrain representations that relies solely on easy-to-collect, unconstrained (e.g., non-expert), and unlabelled robot experience, with no additional constraints on data collection. STERLING employs a novel multi-modal self-supervision objective through non-contrastive representation learning to learn relevant terrain representations for terrain-aware navigation. Through physical robot experiments in off-road environments, we evaluate STERLING features on the task of preference-aligned visual navigation and find that STERLING features perform on par with fully-supervised approaches and outperform other state-of-the-art methods with respect to preference alignment. 
Additionally, we perform a large-scale experiment of autonomously hiking a 3-mile long trail which STERLING completes successfully with only two manual interventions, demonstrating its robustness to real-world off-road conditions.", "keywords": "Vision-based Navigation;Representation Learning;Learning from Experience", "primary_area": "", "supplementary_material": "/attachment/fab64bce267dacaa634f070e627fcafb1d5f74f2.zip", "author": "Haresh Karnan;Elvin Yang;Daniel Farkash;Garrett Warnell;Joydeep Biswas;Peter Stone", "authorids": "~Haresh_Karnan1;~Elvin_Yang1;dmf248@cornell.edu;~Garrett_Warnell1;~Joydeep_Biswas1;~Peter_Stone1", "gender": "M;;;M;M;M", "homepage": ";;;;https://www.joydeepb.com/;http://www.cs.utexas.edu/~pstone", "dblp": ";;;173/5902;84/73;s/PeterStone", "google_scholar": ";kSzfzv8AAAAJ;;Ndp8dmgAAAAJ;https://scholar.google.com.tw/citations?user=f28F1YUAAAAJ;qnwjcfAAAAAJ", "orcid": ";0009-0007-6426-2252;;;0000-0002-1211-1731;0000-0002-6795-420X", "linkedin": ";;;;;", "or_profile": "~Haresh_Karnan1;~Elvin_Yang1;dmf248@cornell.edu;~Garrett_Warnell1;~Joydeep_Biswas1;~Peter_Stone1", "aff": "University of Texas, Austin;University of Texas at Austin;;Army Research Laboratory;The University of Texas at Austin;University of Texas, Austin", "aff_domain": "utexas.edu;cs.utexas.edu;;army.mil;cs.utexas.edu;utexas.edu", "position": "PhD student;Undergrad student;;Research Scientist;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nkarnan2023sterling,\ntitle={{STERLING}: Self-Supervised Terrain Representation Learning from Unconstrained Robot Experience},\nauthor={Haresh Karnan and Elvin Yang and Daniel Farkash and Garrett Warnell and Joydeep Biswas and Peter Stone},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=VLihM67Wdi6}\n}", "github": "https://github.com/HareshKarnan/sterling_corl23", "project": "", "reviewers": "DHLH;oD4L;jqM5;f3rt", "site": "https://openreview.net/forum?id=VLihM67Wdi6", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;4;4;4", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 16, "authors#_avg": 6, "corr_rating_confidence": -0.6622661785325219, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5489824725647996918&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "University of Texas at Austin;Army Research Laboratory", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.arl.army.mil", "aff_unique_abbr": "UT Austin;ARL", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "VscdYkKgwdH", "title": "Neural Graph Control Barrier Functions Guided Distributed Collision-avoidance Multi-agent Control", "track": "main", "status": "Poster", "tldr": "We introduce a new notion of GCBF to encode inter-agent collision and obstacle avoidance in control for large-scale multi-agent systems with LiDAR-based observations, and jointly learn it with a distributed controller using GNNs.", "abstract": "We consider the problem of designing distributed collision-avoidance multi-agent control in large-scale environments with potentially moving obstacles, where a large number of agents are required to maintain safety using only local information and reach their goals. 
This paper addresses the problem of collision avoidance, scalability, and generalizability by introducing graph control barrier functions (GCBFs) for distributed control. The newly introduced GCBF is based on the well-established CBF theory for safety guarantees but utilizes a graph structure for scalable and generalizable decentralized control. We use graph neural networks to learn both a neural GCBF certificate and distributed control. We also extend the framework from handling state-based models to directly taking point clouds from LiDAR for more practical robotics settings. We demonstrate the efficacy of GCBF in a variety of numerical experiments, where the number, density, and traveling distance of agents, as well as the number of unseen and uncontrolled obstacles increase. Empirical results show that GCBF outperforms leading methods such as MAPPO and multi-agent distributed CBF (MDCBF). Trained with only $16$ agents, GCBF can achieve up to $3$ times improvement of success rate (agents reach goals and are never involved in any collisions) on $<500$ agents, and still maintain more than $50\\%$ success rates for $>\\!1000$ agents when other methods completely fail.", "keywords": "Distributed control;Control barrier functions;Graph neural networks", "primary_area": "", "supplementary_material": "/attachment/f219ada7a4a6ad8d161fd5b44fe80777dc693e48.zip", "author": "Songyuan Zhang;Kunal Garg;Chuchu Fan", "authorids": "~Songyuan_Zhang1;~Kunal_Garg1;~Chuchu_Fan2", "gender": "M;M;F", "homepage": "https://syzhang092218-source.github.io;https://search.asu.edu/profile/5172311;https://chuchu.mit.edu", "dblp": "128/0456;;127/1756", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;vs3pl-8AAAAJ;J-dq_8EAAAAJ", "orcid": ";;", "linkedin": ";;chuchu-fan/", "or_profile": "~Songyuan_Zhang1;~Kunal_Garg1;~Chuchu_Fan2", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023neural,\ntitle={Neural Graph Control Barrier Functions Guided Distributed Collision-avoidance Multi-agent Control},\nauthor={Songyuan Zhang and Kunal Garg and Chuchu Fan},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=VscdYkKgwdH}\n}", "github": "", "project": "", "reviewers": "QaEJ;pBaq;dLpN;fBfM;umVL", "site": "https://openreview.net/forum?id=VscdYkKgwdH", "pdf_size": 0, "rating": "4;6;6;6;6", "confidence": "4;4;4;2;5", "rating_avg": 5.6, "confidence_avg": 3.8, "replies_avg": 22, "authors#_avg": 3, "corr_rating_confidence": -0.1020620726159658, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6812242593536293444&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "VtJqMs9ig20", "title": "CAT: Closed-loop Adversarial Training for Safe End-to-End Driving", "track": "main", "status": "Poster", "tldr": "", "abstract": "Driving safety is a top priority for autonomous vehicles. 
Orthogonal to prior work handling accident-prone traffic events by algorithm designs at the policy level, we investigate a \\textbf{C}losed-loop \\textbf{A}dversarial \\textbf{T}raining (CAT) framework for safe end-to-end driving in this paper through the lens of environment augmentation. CAT aims to continuously improve the safety of driving agents by training the agent on safety-critical scenarios that are dynamically generated over time. A novel resampling technique is developed to turn log-replay real-world driving scenarios into safety-critical ones via probabilistic factorization, where the adversarial traffic generation is modeled as the multiplication of standard motion prediction sub-problems. Consequently, CAT can launch more efficient physical attacks compared to existing safety-critical scenario generation methods and yields a significantly less computational cost in the iterative learning pipeline. We incorporate CAT into the MetaDrive simulator and validate our approach on hundreds of driving scenarios imported from real-world driving datasets. Experimental results demonstrate that CAT can effectively generate adversarial scenarios countering the agent being trained. After training, the agent can achieve superior driving safety in both log-replay and safety-critical traffic scenarios on the held-out test set. Code and data are available at: https://metadriverse.github.io/cat", "keywords": "Safety-Critical Scenario Generation;Adversarial Training;End-to-End Driving", "primary_area": "", "supplementary_material": "/attachment/a93ddcf2cb6b2e5b486cde1787c793713c20ded0.zip", "author": "Linrui Zhang;Zhenghao Peng;Quanyi Li;Bolei Zhou", "authorids": "~Linrui_Zhang1;~Zhenghao_Peng1;~Quanyi_Li1;~Bolei_Zhou5", "gender": "M;M;M;M", "homepage": ";https://pengzhenghao.github.io;https://quanyili.github.io;https://boleizhou.github.io/", "dblp": ";220/3963;270/7691;46/8066", "google_scholar": ";JZ8ws6IAAAAJ;Ty49X3UAAAAJ;9D4aG8AAAAAJ", "orcid": ";;;", "linkedin": "%E9%BA%9F%E7%9D%BF-%E5%BC%A0-bb5312222/;;https://www.linkedin.com/mwlite/in/quanyi-li-2b7985183;", "or_profile": "~Linrui_Zhang1;~Zhenghao_Peng1;~Quanyi_Li1;~Bolei_Zhou5", "aff": "Tsinghua University;University of California, Los Angeles;Shanghai Artificial Intelligence Laboratory;University of California, Los Angeles", "aff_domain": "mails.tsinghua.edu.cn;cs.ucla.edu;pjlab.org.cn;ucla.edu", "position": "MS student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023cat,\ntitle={{CAT}: Closed-loop Adversarial Training for Safe End-to-End Driving},\nauthor={Linrui Zhang and Zhenghao Peng and Quanyi Li and Bolei Zhou},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=VtJqMs9ig20}\n}", "github": "https://github.com/metadriverse/cat", "project": "", "reviewers": "s5WY;P4zB;9tii;hXmh", "site": "https://openreview.net/forum?id=VtJqMs9ig20", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "3;3;3;4", "rating_avg": 7.0, "confidence_avg": 3.25, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7896781053192974562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Tsinghua University;University of California, Los Angeles;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucla.edu;http://www.shailab.org/", "aff_unique_abbr": 
"THU;UCLA;Shanghai AI Lab", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "China;United States" }, { "id": "VtUZ4VGPns", "title": "IIFL: Implicit Interactive Fleet Learning from Heterogeneous Human Supervisors", "track": "main", "status": "Poster", "tldr": "A novel imitation learning algorithm for handling both distribution shift and multimodality.", "abstract": "Imitation learning has been applied to a range of robotic tasks, but can struggle when robots encounter edge cases that are not represented in the training data (i.e., distribution shift). Interactive fleet learning (IFL) mitigates distribution shift by allowing robots to access remote human supervisors during task execution and learn from them over time, but different supervisors may demonstrate the task in different ways. Recent work proposes Implicit Behavior Cloning (IBC), which is able to represent multimodal demonstrations using energy-based models (EBMs). In this work, we propose Implicit Interactive Fleet Learning (IIFL), an algorithm that builds on IBC for interactive imitation learning from multiple heterogeneous human supervisors. A key insight in IIFL is a novel approach for uncertainty quantification in EBMs using Jeffreys divergence. While IIFL is more computationally expensive than explicit methods, results suggest that IIFL achieves a 2.8x higher success rate in simulation experiments and a 4.5x higher return on human effort in a physical block pushing task over (Explicit) IFL, IBC, and other baselines.", "keywords": "Imitation Learning;Fleet Learning;Energy-Based Models", "primary_area": "", "supplementary_material": "/attachment/86b525769b09ff8ba75de1272e9527b93582aac1.zip", "author": "Gaurav Datta;Ryan Hoque;Anrui Gu;Eugen Solowjow;Ken Goldberg", "authorids": "~Gaurav_Datta1;~Ryan_Hoque1;~Anrui_Gu1;eugen.solowjow@siemens.com;~Ken_Goldberg1", "gender": ";M;F;;M", "homepage": ";https://ryanhoque.github.io;https://anruigu.github.io;;http://goldberg.berkeley.edu/", "dblp": ";250/9457;;;g/KennethYGoldberg", "google_scholar": ";ywv6tDUAAAAJ;;;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ", "orcid": ";;;;0000-0001-6747-9499", "linkedin": "https://linkedin.com/in/gaurav-datta;https://linkedin.com/in/ryanhoque;anruigu/;;goldbergken/", "or_profile": "~Gaurav_Datta1;~Ryan_Hoque1;~Anrui_Gu1;eugen.solowjow@siemens.com;~Ken_Goldberg1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;;berkeley.edu", "position": "Undergrad student;PhD student;Undergrad student;;Full Professor", "bibtex": "@inproceedings{\ndatta2023iifl,\ntitle={{IIFL}: Implicit Interactive Fleet Learning from Heterogeneous Human Supervisors},\nauthor={Gaurav Datta and Ryan Hoque and Anrui Gu and Eugen Solowjow and Ken Goldberg},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=VtUZ4VGPns}\n}", "github": "https://github.com/BerkeleyAutomation/IIFL", "project": "", "reviewers": "XAiu;z8az;9GiX", "site": "https://openreview.net/forum?id=VtUZ4VGPns", "pdf_size": 0, "rating": "6;6;10", "confidence": "3;5;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9213445923323455444&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "W0zgY2mBTA8", "title": "ChainedDiffuser: Unifying Trajectory Diffusion and Keypose Prediction for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "We present ChainedDiffuser, a policy architecture that unifies transformer-based end-effector action prediction and diffusion-based trajectory generation for learning robotic manipulation policies from demonstrations.", "abstract": "We present ChainedDiffuser, a policy architecture that unifies action keypose prediction and trajectory diffusion generation for learning robot manipulation from demonstrations. Our main innovation is to use a global transformer-based action predictor to predict actions at keyframes, a task that requires multi- modal semantic scene understanding, and to use a local trajectory diffuser to predict trajectory segments that connect predicted macro-actions. ChainedDiffuser sets a new record on established manipulation benchmarks, and outperforms both state-of-the-art keypose (macro-action) prediction models that use motion plan- ners for trajectory prediction, and trajectory diffusion policies that do not predict keyframe macro-actions. We conduct experiments in both simulated and real-world environments and demonstrate ChainedDiffuser\u2019s ability to solve a wide range of manipulation tasks involving interactions with diverse objects.", "keywords": "Manipulation;Imitation Learning;Transformers;Diffusion Models", "primary_area": "", "supplementary_material": "/attachment/0cf85b031abce47d6d7687210c89b016882bab6e.zip", "author": "Zhou Xian;Nikolaos Gkanatsios;Theophile Gervet;Tsung-Wei Ke;Katerina Fragkiadaki", "authorids": "~Zhou_Xian1;~Nikolaos_Gkanatsios1;~Theophile_Gervet1;~Tsung-Wei_Ke2;~Katerina_Fragkiadaki1", "gender": "M;M;M;;F", "homepage": ";https://nickgkan.github.io/;https://theophilegervet.github.io;https://twke18.github.io/;https://www.cs.cmu.edu/~katef/", "dblp": "258/5020;225/5677;;173/4984;21/8780", "google_scholar": ";https://scholar.google.gr/citations?user=jk7GqOEAAAAJ;-o8kQPwAAAAJ;WTEFsHMAAAAJ;FWp7728AAAAJ", "orcid": ";;;;", "linkedin": ";;theophile-gervet/;;", "or_profile": "~Zhou_Xian1;~Nikolaos_Gkanatsios1;~Theophile_Gervet1;~Tsung-Wei_Ke2;~Katerina_Fragkiadaki1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu;andrew.cmu.edu;cmu.edu", "position": "PhD student;Graduate student;PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nxian2023chaineddiffuser,\ntitle={ChainedDiffuser: Unifying Trajectory Diffusion and Keypose Prediction for Robotic Manipulation},\nauthor={Zhou Xian and Nikolaos Gkanatsios and Theophile Gervet and Tsung-Wei Ke and Katerina Fragkiadaki},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=W0zgY2mBTA8}\n}", "github": "https://github.com/zhouxian/chained-diffuser", "project": "", "reviewers": "JWec;BFkn;zryn", "site": "https://openreview.net/forum?id=W0zgY2mBTA8", "pdf_size": 0, "rating": "4;6;6", "confidence": "4;4;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.0, 
"replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16645345674778263594&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "W5SrUCN0yUa", "title": "A Bayesian Approach to Robust Inverse Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We consider a Bayesian approach to offline model-based inverse reinforcement learning (IRL). The proposed framework differs from existing offline model-based IRL approaches by performing simultaneous estimation of the expert's reward function and subjective model of environment dynamics. We make use of a class of prior distributions which parameterizes how accurate the expert\u2019s model of the environment is to develop efficient algorithms to estimate the expert's reward and subjective dynamics in high-dimensional settings. Our analysis reveals a novel insight that the estimated policy exhibits robust performance when the expert is believed (a priori) to have a highly accurate model of the environment. We verify this observation in the MuJoCo environments and show that our algorithms outperform state-of-the-art offline IRL algorithms.", "keywords": "Inverse Reinforcement Learning;Bayesian Inference;Robustness", "primary_area": "", "supplementary_material": "/attachment/3052f7fc08eb591c4fea7c36412acfd3618ff8bd.zip", "author": "Ran Wei;Siliang Zeng;Chenliang Li;Alfredo Garcia;Anthony D McDonald;Mingyi Hong", "authorids": "~Ran_Wei4;~Siliang_Zeng1;~Chenliang_Li3;~Alfredo_Garcia1;~Anthony_D_McDonald1;~Mingyi_Hong1", "gender": "Not Specified;M;M;M;M;M", "homepage": "https://scholar.google.com/citations?user=gPoxkOUAAAAJ&hl=en;https://siliangzeng.github.io/index.html;;https://agarcia.engr.tamu.edu;https://directory.engr.wisc.edu/ie/Faculty/Mcdonald_Tony/;http://people.ece.umn.edu/~mhong/mingyi.html", "dblp": ";38/9;;;;57/8053", "google_scholar": ";IfqsDyYAAAAJ;;;14TnSJIAAAAJ;qRnP-p0AAAAJ", "orcid": ";;;;;", "linkedin": ";;https://www.linkedin.cn/incareer/in/%E7%90%9B%E8%89%AF-%E6%9D%8E-5a333a23b;;;", "or_profile": "~Ran_Wei4;~Siliang_Zeng1;~Chenliang_Li3;~Alfredo_Garcia1;~Anthony_D_McDonald1;~Mingyi_Hong1", "aff": "Texas A&M University - College Station;University of Minnesota, Twin Cities;The Chinese University of Hong Kong;Texas A&M University - College Station;University of Wisconsin - Madison;University of Minnesota, Minneapolis", "aff_domain": "tamu.edu;umn.edu;cuhk.edu.hk;tamu.edu;wisc.edu;umn.edu", "position": "PhD student;PhD student;MS student;Full Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwei2023a,\ntitle={A Bayesian Approach to Robust Inverse Reinforcement Learning},\nauthor={Ran Wei and Siliang Zeng and Chenliang Li and Alfredo Garcia and Anthony D McDonald and Mingyi Hong},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=W5SrUCN0yUa}\n}", "github": "https://github.com/rw422scarlet/bmirl_tf", "project": "", "reviewers": "yDmY;agyf;zKpN;oFYH", "site": "https://openreview.net/forum?id=W5SrUCN0yUa", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;3;5", "rating_avg": 5.5, "confidence_avg": 3.75, 
"replies_avg": 14, "authors#_avg": 6, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=67123873944400519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0;3;1", "aff_unique_norm": "Texas A&M University;University of Minnesota;Chinese University of Hong Kong;University of Wisconsin-Madison", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.tamu.edu;https://www.minnesota.edu;https://www.cuhk.edu.hk;https://www.wisc.edu", "aff_unique_abbr": "TAMU;UMN;CUHK;UW-Madison", "aff_campus_unique_index": "0;1;2;0;3;4", "aff_campus_unique": "College Station;Twin Cities;Hong Kong SAR;Madison;Minneapolis", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": "W7eg2NqFJ60", "title": "Transforming a Quadruped into a Guide Robot for the Visually Impaired: Formalizing Wayfinding, Interaction Modeling, and Safety Mechanism", "track": "main", "status": "Poster", "tldr": "This paper discusses principles and practical solutions for developing a robot guide dog, which needs to learn how to safely guide humans like a real guide dog.", "abstract": "This paper explores the principles for transforming a quadrupedal robot into a guide robot for individuals with visual impairments. A guide robot has great potential to resolve the limited availability of guide animals that are accessible to only two to three percent of the potential blind or visually impaired (BVI) users. To build a successful guide robot, our paper explores three key topics: (1) formalizing the navigation mechanism of a guide dog and a human, (2) developing a data-driven model of their interaction, and (3) improving user safety. First, we formalize the wayfinding task of the human-guide robot team using Markov Decision Processes based on the literature and interviews. Then we collect real human-robot interaction data from three visually impaired and six sighted people and develop an interaction model called the \"Delayed Harness\" to effectively simulate the navigation behaviors of the team. Additionally, we introduce an action shielding mechanism to enhance user safety by predicting and filtering out dangerous actions. We evaluate the developed interaction model and the safety mechanism in simulation, which greatly reduce the prediction errors and the number of collisions, respectively. We also demonstrate the integrated system on an AlienGo robot with a rigid harness, by guiding users over 100+ meter trajectories.", "keywords": "Assistive Robot;Autonomous Navigation;Interaction Modeling", "primary_area": "", "supplementary_material": "/attachment/31bb2a19eba3631af2391e7e065667bf067e3ffc.zip", "author": "J. 
Taery Kim;Wenhao Yu;Yash Kothari;Bruce Walker;Jie Tan;Greg Turk;Sehoon Ha", "authorids": "~J._Taery_Kim1;~Wenhao_Yu1;~Yash_Kothari1;~Bruce_Walker1;~Jie_Tan1;~Greg_Turk1;~Sehoon_Ha2", "gender": "M;M;M;M;;M;", "homepage": "https://wenhaoyu.weebly.com/;;http://sonify.psych.gatech.edu/~walkerb/;http://www.jie-tan.net;;https://www.cc.gatech.edu/~sha9;https://delico123.github.io", "dblp": ";;;81/7419;t/GregTurk;33/10491;220/5761", "google_scholar": "1bF2s2kAAAAJ;;https://scholar.google.com.tw/citations?user=xY3zIEAAAAAJ;neGbgzYAAAAJ;;Q6F3O0sAAAAJ;https://scholar.google.co.kr/citations?user=LWfBEPsAAAAJ", "orcid": ";;0000-0001-8503-5621;;;;", "linkedin": ";yash-kothari-5114b31b3/;brucenwalker/;jie-tan/;;;", "or_profile": "~Wenhao_Yu1;~Yash_Kothari1;~Bruce_Walker1;~Jie_Tan1;~Greg_Turk1;~Sehoon_Ha2;~Joanne_Kim1", "aff": "Google;Georgia Institute of Technology;Georgia Institute of Technology;Google;;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "google.com;gatech.edu;gatech.edu;google.com;;gatech.edu;gatech.edu", "position": "Software Engineer;Undergrad student;Full Professor;Research Scientist;;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nkim2023transforming,\ntitle={Transforming a Quadruped into a Guide Robot for the Visually Impaired: Formalizing Wayfinding, Interaction Modeling, and Safety Mechanism},\nauthor={J. Taery Kim and Wenhao Yu and Yash Kothari and Bruce Walker and Jie Tan and Greg Turk and Sehoon Ha},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=W7eg2NqFJ60}\n}", "github": "", "project": "", "reviewers": "SGRs;1S13;dd3b", "site": "https://openreview.net/forum?id=W7eg2NqFJ60", "pdf_size": 0, "rating": "4;6;10", "confidence": "4;3;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 7, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4051769918350318575&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;0;1;1", "aff_unique_norm": "Google;Georgia Institute of Technology", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.gatech.edu", "aff_unique_abbr": "Google;Georgia Tech", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "W8MjsxHrDpL", "title": "Synthesizing Navigation Abstractions for Planning with Portable Manipulation Skills", "track": "main", "status": "Poster", "tldr": "We present an efficient approach to learning transferable abstractions for task planning with mobile manipulators.", "abstract": "We address the problem of efficiently learning high-level abstractions for task-level robot planning. Existing approaches require large amounts of data and fail to generalize learned abstractions to new environments. To address this, we propose to exploit the independence between spatial and non-spatial state variables in the preconditions of manipulation and navigation skills, mirroring the manipulation-navigation split in robotics research. Given a collection of portable manipulation abstractions (i.e., object-centric manipulation skills paired with matching symbolic representations), we derive an algorithm to automatically generate navigation abstractions that support mobile manipulation planning in a novel environment. 
We apply our approach to simulated data in AI2Thor and on real robot hardware with a coffee preparation task, efficiently generating plannable representations for mobile manipulators in just a few minutes of robot time, significantly outperforming state-of-the-art baselines.", "keywords": "Learning Abstractions;Mobile Manipulation", "primary_area": "", "supplementary_material": "/attachment/5459611add030cec16fd6529feac01b003890b1a.zip", "author": "Eric Rosen;Steven James;Sergio Orozco;Vedant Gupta;Max Merlin;Stefanie Tellex;George Konidaris", "authorids": "~Eric_Rosen1;~Steven_James1;~Sergio_Orozco1;~Vedant_Gupta1;~Max_Merlin1;~Stefanie_Tellex1;~George_Konidaris1", "gender": "M;M;M;M;;F;M", "homepage": "http://cs.brown.edu/people/er35/home.html;;https://github.com/FezTheImmigrant;https://github.com/Mr-vedant-gupta;;https://h2r.cs.brown.edu/;http://cs.brown.edu/people/gdk/", "dblp": "r/EricRosen;195/8202;;368/8560;331/8054;50/3149;56/6762", "google_scholar": "-9rlvZcAAAAJ;;;njUX3U0AAAAJ;n6koQGYAAAAJ;https://scholar.google.com.tw/citations?user=Pd8-ju0AAAAJ;9UERvVEAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;vedant2/;;;", "or_profile": "~Eric_Rosen1;~Steven_James1;~Sergio_Orozco1;~Vedant_Gupta1;~Max_Merlin1;~Stefanie_Tellex1;~George_Konidaris1", "aff": "Brown University;University of the Witwatersrand;Brown University;Brown University;Brown University;, Brown University;Brown University", "aff_domain": "brown.edu;wits.ac.za;brown.edu;brown.edu;brown.edu;cs.brown.edu;brown.edu", "position": "PhD student;Lecturer;MS student;Undergrad student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nrosen2023synthesizing,\ntitle={Synthesizing Navigation Abstractions for Planning with Portable Manipulation Skills},\nauthor={Eric Rosen and Steven James and Sergio Orozco and Vedant Gupta and Max Merlin and Stefanie Tellex and George Konidaris},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=W8MjsxHrDpL}\n}", "github": "https://github.com/ericrosenbrown/aosm_experiments", "project": "", "reviewers": "VDot;Q4vG;zYUj;yMcH", "site": "https://openreview.net/forum?id=W8MjsxHrDpL", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;2;4;3", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 12, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12386062058526661890&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;0;0;0;0;0", "aff_unique_norm": "Brown University;University of the Witwatersrand", "aff_unique_dep": ";", "aff_unique_url": "https://www.brown.edu;https://www.wits.ac.za", "aff_unique_abbr": "Brown;Wits", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0;0;0", "aff_country_unique": "United States;South Africa" }, { "id": "WGSR7HDuHu", "title": "Learning Robot Manipulation from Cross-Morphology Demonstration", "track": "main", "status": "Poster", "tldr": "Generalizing LfD for manipulation to large mismatches between teacher and student morphologies", "abstract": "Some Learning from Demonstrations (LfD) methods handle small mismatches in the action spaces of the teacher and student. Here we address the case where the teacher\u2019s morphology is substantially different from that of the student. 
Our framework, Morphological Adaptation in Imitation Learning (MAIL), bridges this gap allowing us to train an agent from demonstrations by other agents with significantly different morphologies. MAIL learns from suboptimal demonstrations, so long as they provide some guidance towards a desired solution. We demonstrate MAIL on manipulation tasks with rigid and deformable objects including 3D cloth manipulation interacting with rigid obstacles. We train a visual control policy for a robot with one end-effector using demonstrations from a simulated agent with two end-effectors. MAIL shows up to 24% improvement in a normalized performance metric over LfD and non-LfD baselines. It is deployed to a real Franka Panda robot, handles multiple variations in properties for objects (size, rotation, translation), and cloth-specific properties (color, thickness, size, material).", "keywords": "Imitation from Observation;Learning from Demonstration", "primary_area": "", "supplementary_material": "/attachment/c4e33615ed4bf40543f8f2bfb763b7958d4f53b2.zip", "author": "Gautam Salhotra;I-Chun Arthur Liu;Gaurav S. Sukhatme", "authorids": "~Gautam_Salhotra1;~I-Chun_Arthur_Liu1;~Gaurav_S._Sukhatme1", "gender": ";M;M", "homepage": ";http://arthurliu.com/;http://www-robotics.usc.edu/~gaurav/", "dblp": ";;s/GauravSSukhatme", "google_scholar": ";ToWC_fgAAAAJ;https://scholar.google.com.tw/citations?user=lRUi-A8AAAAJ", "orcid": ";0000-0001-7144-634X;0000-0003-2408-474X", "linkedin": ";i-chun-arthur-liu/;gaurav-sukhatme-9b6420b/", "or_profile": "~Gautam_Salhotra1;~I-Chun_Arthur_Liu1;~Gaurav_S._Sukhatme1", "aff": ";University of Southern California;University of Southern California", "aff_domain": ";usc.edu;usc.edu", "position": ";MS student;Full Professor", "bibtex": "@inproceedings{\nsalhotra2023learning,\ntitle={Learning Robot Manipulation from Cross-Morphology Demonstration},\nauthor={Gautam Salhotra and I-Chun Arthur Liu and Gaurav S. Sukhatme},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=WGSR7HDuHu}\n}", "github": "https://github.com/uscresl/mail", "project": "", "reviewers": "UU6D;nE7X;igC1;ExJj", "site": "https://openreview.net/forum?id=WGSR7HDuHu", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;3;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4371348636482025594&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "WWiKBdcpNd", "title": "HANDLOOM: Learned Tracing of One-Dimensional Objects for Inspection and Manipulation", "track": "main", "status": "Oral", "tldr": "Cable State Estimation using an Autoregressive Learned Tracer for Downstream Inspection and Manipulation Tasks", "abstract": "Tracing \u2013 estimating the spatial state of \u2013 long deformable linear objects such as cables, threads, hoses, or ropes, is useful for a broad range of tasks in homes, retail, factories, construction, transportation, and healthcare. 
For long deformable linear objects (DLOs or simply cables) with many (over 25) crossings, we present HANDLOOM (Heterogeneous Autoregressive Learned Deformable Linear Object Observation and Manipulation) a learning-based algorithm that fits a trace to a greyscale image of cables. We evaluate HANDLOOM on semi-planar DLO configurations where each crossing involves at most 2 segments. HANDLOOM makes use of neural networks trained with 30,000 simulated examples and 568 real examples to autoregressively estimate traces of cables and classify crossings. Experiments find that in settings with multiple identical cables, HANDLOOM can trace each cable with 80% accuracy. In single-cable images, HANDLOOM can trace and identify knots with 77% accuracy. When HANDLOOM is incorporated into a bimanual robot system, it enables state-based imitation of knot tying with 80% accuracy, and it successfully untangles 64% of cable configurations across 3 levels of difficulty. Additionally, HANDLOOM demonstrates generalization to knot types and materials (rubber, cloth rope) not present in the training dataset with 85% accuracy. Supplementary material, including all code and an annotated dataset of RGB-D images of cables along with ground-truth traces, is at https://sites.google.com/view/cable-tracing.", "keywords": "state estimation;deformable manipulation", "primary_area": "", "supplementary_material": "/attachment/165f2a6038cc3b4a9dcb9a4bc8ebe20ef0e68ff1.zip", "author": "Vainavi Viswanath;Kaushik Shivakumar;Mallika Parulekar;Jainil Ajmera;Justin Kerr;Jeffrey Ichnowski;Richard Cheng;Thomas Kollar;Ken Goldberg", "authorids": "~Vainavi_Viswanath1;~Kaushik_Shivakumar1;~Mallika_Parulekar1;~Jainil_Ajmera1;~Justin_Kerr1;~Jeffrey_Ichnowski1;~Richard_Cheng1;~Thomas_Kollar1;~Ken_Goldberg1", "gender": "F;M;;M;M;M;;M;M", "homepage": ";https://kmindspark.github.io/;;https://jainilajmera.com/;https://kerrj.github.io/;https://ichnow.ski;;http://tkollar.github.io;http://goldberg.berkeley.edu/", "dblp": ";;;;;89/1741;03/5484;10/6653;g/KennethYGoldberg", "google_scholar": ";2TTqpGQAAAAJ;;;;1OdtfywAAAAJ;d_Fpj0oAAAAJ;AEKT17QAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ", "orcid": ";;;;;0000-0003-4874-9478;;0000-0003-2598-8118;0000-0001-6747-9499", "linkedin": "vainavi-viswanath/;kaushik-shivakumar/;mallikaparulekar/;jainilajmera/;;;;;goldbergken/", "or_profile": "~Vainavi_Viswanath1;~Kaushik_Shivakumar1;~Mallika_Parulekar1;~Jainil_Ajmera1;~Justin_Kerr1;~Jeffrey_Ichnowski1;~Richard_Cheng1;~Thomas_Kollar1;~Ken_Goldberg1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Carnegie Mellon University;Toyota Research Institute;Toyota Research Institute;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;cmu.edu;tri.global;tri.global;berkeley.edu", "position": "MS student;MS student;Undergrad student;Undergrad student;PhD student;Assistant Professor;Researcher;Principal Researcher;Full Professor", "bibtex": "@inproceedings{\nviswanath2023handloom,\ntitle={{HANDLOOM}: Learned Tracing of One-Dimensional Objects for Inspection and Manipulation},\nauthor={Vainavi Viswanath and Kaushik Shivakumar and Mallika Parulekar and Jainil Ajmera and Justin Kerr and Jeffrey Ichnowski and Richard Cheng and Thomas Kollar and Ken Goldberg},\nbooktitle={7th Annual Conference on Robot 
Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=WWiKBdcpNd}\n}", "github": "https://github.com/vainaviv/handloom", "project": "", "reviewers": "bwb5;KUAt;XRHn", "site": "https://openreview.net/forum?id=WWiKBdcpNd", "pdf_size": 0, "rating": "6;10;10", "confidence": "3;5;4", "rating_avg": 8.666666666666666, "confidence_avg": 4.0, "replies_avg": 7, "authors#_avg": 9, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9582123979430196487&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;1;2;2;0", "aff_unique_norm": "University of California, Berkeley;Carnegie Mellon University;Toyota Research Institute", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.cmu.edu;https://www.tri.global", "aff_unique_abbr": "UC Berkeley;CMU;TRI", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "WmF-fagWdD", "title": "SCALE: Causal Learning and Discovery of Robot Manipulation Skills using Simulation", "track": "main", "status": "Poster", "tldr": "SCALE is an algorithm for discovering robot manipulation skills through causal interventions in simulation.", "abstract": "We propose SCALE, an approach for discovering and learning a diverse set of interpretable robot skills from a limited dataset. Rather than learning a single skill which may fail to capture all the modes in the data, we first identify the different modes via causal reasoning and learn a separate skill for each of them. Our main insight is to associate each mode with a unique set of causally relevant context variables that are discovered by performing causal interventions in simulation. This enables data partitioning based on the causal processes that generated the data, and then compressed skills that ignore the irrelevant variables can be trained. We model each robot skill as a Regional Compressed Option, which extends the options framework by associating a causal process and its relevant variables with the option. Modeled as the skill Data Generating Region, each causal process is local in nature and hence valid over only a subset of the context space. We demonstrate our approach for two representative manipulation tasks: block stacking and peg-in-hole insertion under uncertainty. 
Our experiments show that our approach yields diverse skills that are compact, robust to domain shifts, and suitable for sim-to-real transfer.", "keywords": "skill discovery;causal learning;manipulation", "primary_area": "", "supplementary_material": "/attachment/9779116623c3edd778be7558a514d124c7487a35.zip", "author": "Tabitha Edith Lee;Shivam Vats;Siddharth Girdhar;Oliver Kroemer", "authorids": "~Tabitha_Edith_Lee1;~Shivam_Vats1;~Siddharth_Girdhar1;~Oliver_Kroemer1", "gender": "M;M;M;F", "homepage": "https://shivamvats.com/;;https://www.ri.cmu.edu/ri-faculty/oliver-kroemer/;https://tabula-rosa.github.io/", "dblp": "180/9226;;04/7743;", "google_scholar": ";;_tbXjP4AAAAJ;ZD6QUvYAAAAJ", "orcid": ";;;", "linkedin": ";siddharthgir/;;tabithaedith", "or_profile": "~Shivam_Vats1;~Siddharth_Girdhar1;~Oliver_Kroemer1;~Timothy_E_Lee1", "aff": "Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;;cmu.edu;cmu.edu", "position": "PhD student;;Assistant Professor;PhD Student", "bibtex": "@inproceedings{\nlee2023scale,\ntitle={{SCALE}: Causal Learning and Discovery of Robot Manipulation Skills using Simulation},\nauthor={Tabitha Edith Lee and Shivam Vats and Siddharth Girdhar and Oliver Kroemer},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=WmF-fagWdD}\n}", "github": "", "project": "", "reviewers": "H8Po;7iCw;zy8g;yTw8", "site": "https://openreview.net/forum?id=WmF-fagWdD", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;3;4;2", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1649696112907698310&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "WuBv9-IGDUA", "title": "Multi-Resolution Sensing for Real-Time Control with Vision-Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Leveraging sensing modalities across diverse spatial and temporal resolutions can improve performance of robotic manipulation tasks. Multi-spatial resolution sensing provides hierarchical information captured at different spatial scales and enables both coarse and precise motions. Simultaneously multi-temporal resolution sensing enables the agent to exhibit high reactivity and real-time control. In this work, we propose a framework for learning generalizable language-conditioned multi-task policies that utilize sensing at different spatial and temporal resolutions using networks of varying capacities to effectively perform real time control of precise and reactive tasks. We leverage off-the-shelf pretrained vision-language models to operate on low-frequency global features along with small non-pretrained models to adapt to high frequency local feedback. Through extensive experiments in 3 domains (coarse, precise and dynamic manipulation tasks), we show that our approach significantly improves ($2\\times$ on average) over recent multi-task baselines. 
Further, our approach generalizes well to visual and geometric variations in target objects and to varying interaction forces.", "keywords": "Manipulation;Learning for manipulation", "primary_area": "", "supplementary_material": "/attachment/f3a7c443690417bc0771cc1ff938cf990e636733.zip", "author": "Saumya Saxena;Mohit Sharma;Oliver Kroemer", "authorids": "~Saumya_Saxena1;~Mohit_Sharma1;~Oliver_Kroemer1", "gender": "F;M;M", "homepage": "https://saumyasaxena.github.io;https://mohitsharma0690.github.io/;https://www.ri.cmu.edu/ri-faculty/oliver-kroemer/", "dblp": ";;04/7743", "google_scholar": "zvtzoPgAAAAJ;;_tbXjP4AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Saumya_Saxena1;~Mohit_Sharma1;~Oliver_Kroemer1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nsaxena2023multiresolution,\ntitle={Multi-Resolution Sensing for Real-Time Control with Vision-Language Models},\nauthor={Saumya Saxena and Mohit Sharma and Oliver Kroemer},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=WuBv9-IGDUA}\n}", "github": "", "project": "", "reviewers": "VHHQ;FtkG;pRJN;7F3D", "site": "https://openreview.net/forum?id=WuBv9-IGDUA", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;5", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.9271726499455306, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12937250602839681166&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "X0cmlTh1Vl", "title": "Waypoint-Based Imitation Learning for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "An automatic method for selecting waypoints from demonstrations for performant behavioral cloning.", "abstract": "While imitation learning methods have seen a resurgent interest for robotic manipulation, the well-known problem of compounding errors continues to afflict behavioral cloning (BC). Waypoints can help address this problem by reducing the horizon of the learning problem for BC, and thus, the errors compounded over time. However, waypoint labeling is underspecified, and requires additional human supervision. Can we generate waypoints automatically without any additional human supervision? Our key insight is that if a trajectory segment can be approximated by linear motion, the endpoints can be used as waypoints. We propose Automatic Waypoint Extraction (AWE) for imitation learning, a preprocessing module to decompose a demonstration into a minimal set of waypoints which when interpolated linearly can approximate the trajectory up to a specified error threshold. AWE can be combined with any BC algorithm, and we find that AWE can increase the success rate of state-of-the-art algorithms by up to 25% in simulation and by 4-28% on real-world bimanual manipulation tasks, reducing the decision making horizon by up to a factor of 10. 
Videos and code are available at https://lucys0.github.io/awe/.", "keywords": "imitation learning;waypoints;long-horizon", "primary_area": "", "supplementary_material": "/attachment/654ec1e0dbd9c00cee5fb0ae5bf941d96a5ba123.zip", "author": "Lucy Xiaoyang Shi;Archit Sharma;Tony Z. Zhao;Chelsea Finn", "authorids": "~Lucy_Xiaoyang_Shi1;~Archit_Sharma1;~Tony_Z._Zhao1;~Chelsea_Finn1", "gender": "F;M;;F", "homepage": "https://lucys0.github.io/;;https://tonyzhaozh.github.io/;https://ai.stanford.edu/~cbfinn/", "dblp": "324/5129;220/3163.html;;131/1783", "google_scholar": ";_0IIzxgAAAAJ;;vfPE6hgAAAAJ", "orcid": ";;;", "linkedin": "lucy-xiaoyang-shi/;;;", "or_profile": "~Lucy_Xiaoyang_Shi1;~Archit_Sharma1;~Tony_Z._Zhao1;~Chelsea_Finn1", "aff": "University of Southern California;Stanford University;Stanford University;Google", "aff_domain": "usc.edu;stanford.edu;stanford.edu;google.com", "position": "Undergrad student;Graduate Student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nshi2023waypointbased,\ntitle={Waypoint-Based Imitation Learning for Robotic Manipulation},\nauthor={Lucy Xiaoyang Shi and Archit Sharma and Tony Z. Zhao and Chelsea Finn},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=X0cmlTh1Vl}\n}", "github": "https://github.com/lucys0/awe", "project": "", "reviewers": "cGVi;3s7U;ZvMK", "site": "https://openreview.net/forum?id=X0cmlTh1Vl", "pdf_size": 0, "rating": "4;6;6", "confidence": "5;5;4", "rating_avg": 5.333333333333333, "confidence_avg": 4.666666666666667, "replies_avg": 8, "authors#_avg": 4, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 59, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18054237598959901266&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of Southern California;Stanford University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.usc.edu;https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "USC;Stanford;Google", "aff_campus_unique_index": "0;1;1;2", "aff_campus_unique": "Los Angeles;Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "X7okQlJz9M", "title": "Seeing-Eye Quadruped Navigation with Force Responsive Locomotion Control", "track": "main", "status": "Poster", "tldr": "We design and implement a seeing-eye robot system which can guide a blindfolded human, and respond to directional queues via tugs.", "abstract": "Seeing-eye robots are very useful tools for guiding visually impaired people, potentially producing a huge societal impact given the low availability and high cost of real guide dogs. Although a few seeing-eye robot systems have already been demonstrated, none considered external tugs from humans, which frequently occur in a real guide dog setting. In this paper, we simultaneously train a locomotion controller that is robust to external tugging forces via Reinforcement Learning~(RL), and an external force estimator via supervised learning. The controller ensures stable walking, and the force estimator enables the robot to respond to the external forces from the human. These forces are used to guide the robot to the global goal, which is unknown to the robot, while the robot guides the human around nearby obstacles via a local planner. 
Experimental results in simulation and on hardware show that our controller is robust to external forces, and our seeing-eye system can accurately detect force direction. We demonstrate our full seeing-eye robot system on a real quadruped robot with a blindfolded human.", "keywords": "seeing-eye robot;robotic guide dog;human-robot interaction;quadruped locomotion", "primary_area": "", "supplementary_material": "/attachment/70c615547cfb07dfe9ea0684d1d008d884a56b2f.zip", "author": "David DeFazio;Eisuke Hirota;Shiqi Zhang", "authorids": "~David_DeFazio1;~Eisuke_Hirota1;~Shiqi_Zhang1", "gender": "M;M;M", "homepage": ";https://ei5uke.github.io/;http://www.cs.binghamton.edu/~szhang/", "dblp": "231/7609;;03/9964-1", "google_scholar": "9mCgbWIAAAAJ;NBvMsAMAAAAJ;D0pzuNoAAAAJ", "orcid": ";;0000-0003-4110-8213", "linkedin": ";eisukeh/;", "or_profile": "~David_DeFazio1;~Eisuke_Hirota1;~Shiqi_Zhang1", "aff": "State University of New York at Binghamton;New York University;State University of New York at Binghamton", "aff_domain": "binghamton.edu;nyu.edu;binghamton.edu", "position": "PhD student;Intern;Assistant Professor", "bibtex": "@inproceedings{\ndefazio2023seeingeye,\ntitle={Seeing-Eye Quadruped Navigation with Force Responsive Locomotion Control},\nauthor={David DeFazio and Eisuke Hirota and Shiqi Zhang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=X7okQlJz9M}\n}", "github": "https://github.com/bu-air-lab/guide_dog", "project": "", "reviewers": "5maN;Pxez;SGaq;wGQP;BKt3", "site": "https://openreview.net/forum?id=X7okQlJz9M", "pdf_size": 0, "rating": "4;6;6;10;10", "confidence": "5;4;4;5;5", "rating_avg": 7.2, "confidence_avg": 4.6, "replies_avg": 18, "authors#_avg": 3, "corr_rating_confidence": 0.408248290463863, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1294899362592446087&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "State University of New York at Binghamton;New York University", "aff_unique_dep": ";", "aff_unique_url": "https://www.binghamton.edu;https://www.nyu.edu", "aff_unique_abbr": "SUNY Binghamton;NYU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Binghamton;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "XEw-cnNsr6", "title": "DATT: Deep Adaptive Trajectory Tracking for Quadrotor Control", "track": "main", "status": "Oral", "tldr": "DATT enables quadrotors to precisely track arbitrary, potentially infeasible trajectories in the presence of large disturbances.", "abstract": "Precise arbitrary trajectory tracking for quadrotors is challenging due to unknown nonlinear dynamics, trajectory infeasibility, and actuation limits. To tackle these challenges, we present DATT, a learning-based approach that can precisely track arbitrary, potentially infeasible trajectories in the presence of large disturbances in the real world. DATT builds on a novel feedforward-feedback-adaptive control structure trained in simulation using reinforcement learning. When deployed on real hardware, DATT is augmented with a disturbance estimator using $\\mathcal{L}_1$ adaptive control in closed-loop, without any fine-tuning. DATT significantly outperforms competitive adaptive nonlinear and model predictive controllers for both feasible smooth and infeasible trajectories in unsteady wind fields, including challenging scenarios where baselines completely fail. 
Moreover, DATT can efficiently run online with an inference time less than 3.2ms, less than 1/4 of the adaptive nonlinear model predictive control baseline.", "keywords": "Quadrotor;Reinforcement Learning;Adaptive Control", "primary_area": "", "supplementary_material": "/attachment/abfaf8d5d64b9fdbe8e5a5d110c6d728ca880fc7.zip", "author": "Kevin Huang;Rwik Rana;Alexander Spitzer;Guanya Shi;Byron Boots", "authorids": "~Kevin_Huang2;~Rwik_Rana1;~Alexander_Spitzer2;~Guanya_Shi1;~Byron_Boots1", "gender": ";;M;M;", "homepage": "http://kevinhuang8.github.io;;https://alspitz.github.io;http://guanyashi.github.io;", "dblp": ";;;230/4386;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;joR1Z4UAAAAJ;", "orcid": ";;;0000-0002-9075-3705;", "linkedin": ";;;guanya-shi-b07b43126/;", "or_profile": "~Kevin_Huang2;~Rwik_Rana1;~Alexander_Spitzer2;~Guanya_Shi1;~Byron_Boots1", "aff": "University of Washington;;University of Washington;University of Washington;", "aff_domain": "cs.washington.edu;;uw.edu;uw.edu;", "position": "PhD student;;Postdoc;Postdoc;", "bibtex": "@inproceedings{\nhuang2023datt,\ntitle={{DATT}: Deep Adaptive Trajectory Tracking for Quadrotor Control},\nauthor={Kevin Huang and Rwik Rana and Alexander Spitzer and Guanya Shi and Byron Boots},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=XEw-cnNsr6}\n}", "github": "https://github.com/KevinHuang8/DATT", "project": "", "reviewers": "Tsi5;cDww;fQDU;Bvch", "site": "https://openreview.net/forum?id=XEw-cnNsr6", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;4;5;5", "rating_avg": 8.0, "confidence_avg": 4.5, "replies_avg": 16, "authors#_avg": 5, "corr_rating_confidence": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2365868131551598941&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "XMQgwiJ7KSX", "title": "RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control", "track": "main", "status": "Poster", "tldr": "Vision-language models, trained on Internet-scale data, can be incorporated directly into end-to-end robotic control to boost generalization and enable emergent semantic reasoning.", "abstract": "We study how vision-language models trained on Internet-scale data can be incorporated directly into end-to-end robotic control to boost generalization and enable emergent semantic reasoning. Our goal is to enable a single end-to-end trained model to both learn to map robot observations to actions and enjoy the benefits of large-scale pretraining on language and vision-language data from the web. To this end, we propose to co-fine-tune state-of-the-art vision-language models on both robotic trajectory data and Internet-scale vision-language tasks, such as visual question answering. In contrast to other approaches, we propose a simple, general recipe to achieve this goal: in order to fit both natural language responses and robotic actions into the same format, we express the actions as text tokens and incorporate them directly into the training set of the model in the same way as natural language tokens. 
We refer to such category of models as vision-language-action models (VLA) and instantiate an example of such a model, which we call RT-2. Our extensive evaluation (6k evaluation trials) shows that our approach leads to performant robotic policies and enables RT-2 to obtain a range of emergent capabilities from Internet-scale training. This includes significantly improved generalization to novel objects, the ability to interpret commands not present in the robot training data (such as placing an object onto a particular number or icon), and the ability to perform rudimentary reasoning in response to user commands (such as picking up the smallest or largest object, or the one closest to another object). We further show that incorporating chain of thought reasoning allows RT-2 to perform multi-stage semantic reasoning, for example figuring out which object to pick up for use as an improvised hammer (a rock), or which type of drink is best suited for someone who is tired (an energy drink).", "keywords": "vision-language models;robot manipulation;generalization", "primary_area": "", "supplementary_material": "/attachment/23f53dd890d011f81f3f5d55eafc68cf0e926dc2.zip", "author": "Brianna Zitkovich;Tianhe Yu;Sichun Xu;Peng Xu;Ted Xiao;Fei Xia;Jialin Wu;Paul Wohlhart;Stefan Welker;Ayzaan Wahid;Quan Vuong;Vincent Vanhoucke;Huong Tran;Radu Soricut;Anikait Singh;Jaspiar Singh;Pierre Sermanet;Pannag R Sanketi;Grecia Salazar;Michael S Ryoo;Krista Reymann;Kanishka Rao;Karl Pertsch;Igor Mordatch;Henryk Michalewski;Yao Lu;Sergey Levine;Lisa Lee;Tsang-Wei Edward Lee;Isabel Leal;Yuheng Kuang;Dmitry Kalashnikov;Ryan Julian;Nikhil J Joshi;Alex Irpan;brian ichter;Jasmine Hsu;Alexander Herzog;Karol Hausman;Keerthana Gopalakrishnan;Chuyuan Fu;Pete Florence;Chelsea Finn;Kumar Avinava Dubey;Danny Driess;Tianli Ding;Krzysztof Marcin Choromanski;Xi Chen;Yevgen Chebotar;Justice Carbajal;Noah Brown;Anthony Brohan;Montserrat Gonzalez Arenas;Kehang Han", "authorids": "zitkovich@google.com;~Tianhe_Yu1;sicxu@google.com;~Peng_Xu9;~Ted_Xiao1;~Fei_Xia1;~Jialin_Wu1;~Paul_Wohlhart1;~Stefan_Welker1;~Ayzaan_Wahid1;~Quan_Vuong2;~Vincent_Vanhoucke1;huongtt@google.com;~Radu_Soricut2;~Anikait_Singh1;jaspiar@google.com;~Pierre_Sermanet1;~Pannag_R_Sanketi1;grecias@google.com;~Michael_S_Ryoo1;reymann@google.com;~Kanishka_Rao1;~Karl_Pertsch1;~Igor_Mordatch4;~Henryk_Michalewski1;~Yao_Lu13;~Sergey_Levine1;~Lisa_Lee1;~Tsang-Wei_Edward_Lee1;isabelleal@google.com;yuheng@google.com;~Dmitry_Kalashnikov1;~Ryan_Julian2;~Nikhil_J_Joshi1;~Alex_Irpan1;~brian_ichter1;hellojas@google.com;~Alexander_Herzog2;~Karol_Hausman2;~Keerthana_Gopalakrishnan1;~Chuyuan_Fu1;~Pete_Florence1;~Chelsea_Finn1;~Kumar_Avinava_Dubey1;~Danny_Driess1;~Tianli_Ding1;~Krzysztof_Marcin_Choromanski1;~Xi_Chen23;~Yevgen_Chebotar1;jucarbajal@google.com;noahbrown@google.com;~Anthony_Brohan1;~Montserrat_Gonzalez_Arenas1;~Kehang_Han1", "gender": ";M;;M;M;M;M;M;Not Specified;M;M;M;;M;M;;;M;;M;;;;;M;;M;;M;;;;M;M;M;;;M;;F;F;;F;;;M;;;M;;;M;F;", "homepage": 
";https://cs.stanford.edu/~tianheyu/;;;https://www.tedxiao.me;;https://jialinwu.netlify.app/;;;https://ayzaan.com;https://quanvuong.github.io;http://vincent.vanhoucke.com;;;https://asap7772.github.io/;;https://sermanet.github.io/;;;http://michaelryoo.com/;;https://research.google/people/KanishkaRao/;https://kpertsch.github.io/;;https://www.mimuw.edu.pl/~henrykm/;;https://people.eecs.berkeley.edu/~svlevine/;;;;;;https://ryanjulian.me;;http://www.alexirpan.com;;;;;https://keerthanapg.com;;http://www.peteflorence.com/;https://ai.stanford.edu/~cbfinn/;;https://dannydriess.github.io/;;;https://xchen147.github.io/;;;;;;", "dblp": ";192/1797;;;198/0598;;149/5889;http://dblp.uni-trier.de/pers/hd/w/Wohlhart:Paul;;;;69/7157;;;302/3876;;28/6457;;;r/MichaelSRyoo;;;211/7137;;https://dblp.uni-trier.de/pers/hd/m/Michalewski:Henryk;26/5662-6;80/7594;;236/6317.html;;;222/2882;227/2645;;202/2063;;;;;;;;131/1783;;;330/9157;78/11411;16/3283;01/11424;;;;;198/6522", "google_scholar": ";;;460NWeQAAAAJ;;pqP5_PgAAAAJ;M7EpKqsAAAAJ;SzHPa90AAAAJ;sI0DsP8AAAAJ;;NSWI3OwAAAAJ;T7uctwYAAAAJ;;NAzD9mgAAAAJ;lPaISmIAAAAJ;;0nPi5YYAAAAJ;GuU6oA4AAAAJ;;vcw0TJIAAAAJ;;;https://scholar.google.com/citations?view_op=list_works;;YdHW1ycAAAAJ;OI7zFmwAAAAJ;8R35rCwAAAAJ;;;;;;8C2_ZVsAAAAJ;XzETn4QAAAAJ;;-w5DuHgAAAAJ;;jrfFYAIAAAAJ;;;bDq7MZMAAAAJ;;vfPE6hgAAAAJ;;https://scholar.google.de/citations?user=wxnzyjwAAAAJ;;;https://scholar.google.com/citations?hl=en;ADkiClQAAAAJ;;;;;", "orcid": ";;;;;0000-0003-4343-1444;;;;;;0000-0003-0544-2791;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;", "linkedin": ";;;;;;jialin-wu-a50135175/;;;;;vanhoucke;;;asap7772/;;sermanet/;;;;;;;;henryk-michalewski-8a230a27/;;;;;;;;;nikhil-j-joshi;;;;alexander-herzog-154030a5/;;;;;;;;tianli-ding/;;;;;;anthony-brohan-99782b36;montse90/;", "or_profile": "zitkovich@google.com;~Tianhe_Yu1;sicxu@google.com;~Peng_Xu9;~Ted_Xiao1;~Fei_Xia1;~Jialin_Wu1;~Paul_Wohlhart1;~Stefan_Welker1;~Ayzaan_Wahid1;~Quan_Vuong2;~Vincent_Vanhoucke1;huongtt@google.com;~Radu_Soricut2;~Anikait_Singh1;jaspiar@google.com;~Pierre_Sermanet1;~Pannag_R_Sanketi1;grecias@google.com;~Michael_S_Ryoo1;reymann@google.com;~Kanishka_Rao1;~Karl_Pertsch1;~Igor_Mordatch4;~Henryk_Michalewski1;~Yao_Lu13;~Sergey_Levine1;~Lisa_Lee1;~Tsang-Wei_Edward_Lee1;isabelleal@google.com;yuheng@google.com;~Dmitry_Kalashnikov1;~Ryan_Julian2;~Nikhil_J_Joshi1;~Alex_Irpan1;~brian_ichter1;hellojas@google.com;~Alexander_Herzog2;~Karol_Hausman2;~Keerthana_Gopalakrishnan1;~Chuyuan_Fu1;~Pete_Florence1;~Chelsea_Finn1;~Kumar_Avinava_Dubey1;~Danny_Driess1;~Tianli_Ding1;~Krzysztof_Marcin_Choromanski1;~Xi_Chen23;~Yevgen_Chebotar1;jucarbajal@google.com;noahbrown@google.com;~Anthony_Brohan1;~Montserrat_Gonzalez_Arenas1;~Kehang_Han1", "aff": ";Google Brain;;Google;;Google;Google;Graz University of Technology;;Robotics at Google;;Google;;Google;University of California, Berkeley;;Google;Google;;Google DeepMind;;;University of Southern California;;Google DeepMind;Google;Google;;;;;Google;Google;Research, Google;Google DeepMind;Google;;Google;;Research, Google;Google;Google;Google;;Technische Universit\u00e4t Berlin;Google;Google Brain Robotics & Columbia University;Google;Google;;;;;Google DeepMind", "aff_domain": ";google.com;;google.com;;google.com;google.com; 
;;google.com;;google.com;;google.com;berkeley.edu;;google.com;google.com;;google.com;;;usc.edu;;google.com;google.com;google.com;;;;;google.com;google.com;research.google.com;google.com;google.com;;google.com;;research.google.com;google.com;google.com;google.com;;tu-berlin.de;google.com;columbia.edu;google.com;google.com;;;;;google.com", "position": ";Research Scientist;;Researcher;;Researcher;Researcher;Post Doc;;Software Engineer;;Principal Scientist;;Research Scientist;Undergrad student;;Research Scientist;Researcher;;Research Scientist;;;PhD student;;Researcher;Researcher;Research Scientist;;;;;Researcher;Senior Research Software Engineer;Researcher;Researcher;Research Scientist;;Researcher;;Researcher;software engineer;Research Scientist;Research Scientist;;PhD student;Researcher;research scientist & adjunct assistant professor;Researcher;Research Scientist;;;;;Researcher", "bibtex": "@inproceedings{\nzitkovich2023rt,\ntitle={{RT}-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control},\nauthor={Brianna Zitkovich and Tianhe Yu and Sichun Xu and Peng Xu and Ted Xiao and Fei Xia and Jialin Wu and Paul Wohlhart and Stefan Welker and Ayzaan Wahid and Quan Vuong and Vincent Vanhoucke and Huong Tran and Radu Soricut and Anikait Singh and Jaspiar Singh and Pierre Sermanet and Pannag R Sanketi and Grecia Salazar and Michael S Ryoo and Krista Reymann and Kanishka Rao and Karl Pertsch and Igor Mordatch and Henryk Michalewski and Yao Lu and Sergey Levine and Lisa Lee and Tsang-Wei Edward Lee and Isabel Leal and Yuheng Kuang and Dmitry Kalashnikov and Ryan Julian and Nikhil J Joshi and Alex Irpan and brian ichter and Jasmine Hsu and Alexander Herzog and Karol Hausman and Keerthana Gopalakrishnan and Chuyuan Fu and Pete Florence and Chelsea Finn and Kumar Avinava Dubey and Danny Driess and Tianli Ding and Krzysztof Marcin Choromanski and Xi Chen and Yevgen Chebotar and Justice Carbajal and Noah Brown and Anthony Brohan and Montserrat Gonzalez Arenas and Kehang Han},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=XMQgwiJ7KSX}\n}", "github": "", "project": "", "reviewers": "N1nv;YNc7;Bmbe;UgBu", "site": "https://openreview.net/forum?id=XMQgwiJ7KSX", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;3;4;5", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 9, "authors#_avg": 54, "corr_rating_confidence": 0.0, "gs_citation": 1068, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4360518173578415769&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1;0;0;0;2;0;0;0;3;0;0;0;0;0;0;0;0;0;0;0;0;0;4;0;0;0;0;0", "aff_unique_norm": "Google;Graz University of Technology;University of California, Berkeley;University of Southern California;Technische Universit\u00e4t Berlin", "aff_unique_dep": "Google Brain;;;;", "aff_unique_url": "https://brain.google.com;https://www.tugraz.at;https://www.berkeley.edu;https://www.usc.edu;https://www.tu-berlin.de", "aff_unique_abbr": "Google Brain;TUGraz;UC Berkeley;USC;TU Berlin", "aff_campus_unique_index": "0;0;0;0;0;0;0;2;0;0;3;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Mountain View;;Berkeley;Los Angeles", "aff_country_unique_index": "0;0;0;0;1;0;0;0;0;0;0;2;0;2;0;0;0;0;0;2;0;0;0;0;0;0;3;0;0;0;0;2", "aff_country_unique": "United States;Austria;United Kingdom;Germany" }, { "id": "XsWGVbPfB4Z", "title": "4D-Former: Multimodal 4D Panoptic Segmentation", "track": "main", "status": "Poster", "tldr": "We propose 4D-Former: a novel method 
for 4D panoptic segmentation which leverages both LiDAR and image modalities, and predicts semantic masks as well as temporally consistent object masks for the input point-cloud sequence.", "abstract": "4D panoptic segmentation is a challenging but practically useful task that requires every point in a LiDAR point-cloud sequence to be assigned a semantic class label, and individual objects to be segmented and tracked over time. Existing approaches utilize only LiDAR inputs which convey limited information in regions with point sparsity. This problem can, however, be mitigated by utilizing RGB camera images which offer appearance-based information that can reinforce the geometry-based LiDAR features. Motivated by this, we propose 4D-Former: a novel method for 4D panoptic segmentation which leverages both LiDAR and image modalities, and predicts semantic masks as well as temporally consistent object masks for the input point-cloud sequence. We encode semantic classes and objects using a set of concise queries which absorb feature information from both data modalities. Additionally, we propose a learned mechanism to associate object tracks over time which reasons over both appearance and spatial location. We apply 4D-Former to the nuScenes and SemanticKITTI datasets where it achieves state-of-the-art results.", "keywords": "Panoptic Segmentation;Sensor Fusion;Temporal Reasoning;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/5f2a46f597efc73c56c3f0b0fc6cee212d8787b6.zip", "author": "Ali Athar;Enxu Li;Sergio Casas;Raquel Urtasun", "authorids": "~Ali_Athar1;~Enxu_Li1;~Sergio_Casas2;~Raquel_Urtasun1", "gender": "M;;F;M", "homepage": "https://www.aliathar.net/;https://www.cs.toronto.edu/~tli/;http://www.cs.toronto.edu/~urtasun/;http://www.cs.toronto.edu/~sergio/", "dblp": "187/5650;285/4934;u/RaquelUrtasun;46/6535-2", "google_scholar": "mexenQMAAAAJ;Bk4LuGYAAAAJ;https://scholar.google.ca/citations?user=jyxO2akAAAAJ;Vgo1x9YAAAAJ", "orcid": "0000-0001-6807-3925;;;", "linkedin": "aliathar94/;thomas-enxu-li/;;sergio-casas/", "or_profile": "~Ali_Athar1;~Enxu_Li1;~Raquel_Urtasun1;~Sergio_Casas_Romero1", "aff": "Waabi Innovation;Waabi;Department of Computer Science, University of Toronto;University of Toronto", "aff_domain": "waabi.ai;waabi.ai;cs.toronto.edu;toronto.edu", "position": "Intern;Researcher;Full Professor;PhD student", "bibtex": "@inproceedings{\nathar2023dformer,\ntitle={4D-Former: Multimodal 4D Panoptic Segmentation},\nauthor={Ali Athar and Enxu Li and Sergio Casas and Raquel Urtasun},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=XsWGVbPfB4Z}\n}", "github": "", "project": "", "reviewers": "g5r2;56js;oeTB;WZbn", "site": "https://openreview.net/forum?id=XsWGVbPfB4Z", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;5;4;5", "rating_avg": 6.5, "confidence_avg": 4.75, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": 0.13245323570650439, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10776151581148110301&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Waabi Innovation;Waabi;University of Toronto", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "https://waabi.ai;;https://www.utoronto.ca", "aff_unique_abbr": ";;U of T", "aff_campus_unique_index": "1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada;" }, { "id": 
"ZFjgfJb_5c", "title": "Embodied Lifelong Learning for Task and Motion Planning", "track": "main", "status": "Poster", "tldr": "We formulate a realistic variant of the problem of lifelong learning for TAMP, and devise a mixture of generative models for generating samples for efficient planning", "abstract": "A robot deployed in a home over long stretches of time faces a true lifelong learning problem. As it seeks to provide assistance to its users, the robot should leverage any accumulated experience to improve its own knowledge and proficiency. We formalize this setting with a novel formulation of lifelong learning for task and motion planning (TAMP), which endows our learner with the compositionality of TAMP systems. Exploiting the modularity of TAMP, we develop a mixture of generative models that produces candidate continuous parameters for a planner. Whereas most existing lifelong learning approaches determine a priori how data is shared across various models, our approach learns shared and non-shared models and determines which to use online during planning based on auxiliary tasks that serve as a proxy for each model's understanding of a state. Our method exhibits substantial improvements (over time and compared to baselines) in planning success on 2D and BEHAVIOR domains.", "keywords": "task and motion planning;lifelong learning;generative models", "primary_area": "", "supplementary_material": "/attachment/4e9e040278811b28de2d7a27402adbbbe7af1f20.zip", "author": "Jorge Mendez-Mendez;Leslie Pack Kaelbling;Tom\u00e1s Lozano-P\u00e9rez", "authorids": "~Jorge_Mendez-Mendez1;~Leslie_Pack_Kaelbling1;~Tom\u00e1s_Lozano-P\u00e9rez1", "gender": "F;M;M", "homepage": "http://people.csail.mit.edu/lpk/;http://people.csail.mit.edu/tlp/;https://www.seas.upenn.edu/~mendezme/", "dblp": "k/LesliePackKaelbling;90/752;255/6609", "google_scholar": "IcasIiwAAAAJ;gQOKAggAAAAJ;87sQtnsAAAAJ", "orcid": "0000-0001-6054-7145;;0000-0002-2537-598X", "linkedin": ";;", "or_profile": "~Leslie_Pack_Kaelbling1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Jorge_Armando_Mendez_Mendez1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu", "position": "Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\nmendez-mendez2023embodied,\ntitle={Embodied Lifelong Learning for Task and Motion Planning},\nauthor={Jorge Mendez-Mendez and Leslie Pack Kaelbling and Tom{\\'a}s Lozano-P{\\'e}rez},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ZFjgfJb_5c}\n}", "github": "", "project": "", "reviewers": "8dBQ;VjVf;2eBC;h46E", "site": "https://openreview.net/forum?id=ZFjgfJb_5c", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1418632905553136674&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "_A15qsPswaK", "title": "HYDRA: Hybrid Robot Actions for Imitation Learning", "track": "main", "status": "Poster", "tldr": "We learn a hybrid robot action 
space that dynamically switches between low-level actions and high-level waypoints, and our method substantially outperforms baselines on a variety of long horizon real world tasks like making coffee and sorting dishes.", "abstract": "Imitation Learning (IL) is a sample efficient paradigm for robot learning using expert demonstrations. However, policies learned through IL suffer from state distribution shift at test time, due to compounding errors in action prediction which lead to previously unseen states. Choosing an action representation for the policy that minimizes this distribution shift is critical in imitation learning. Prior work proposes using temporal action abstractions to reduce compounding errors, but they often sacrifice policy dexterity or require domain-specific knowledge. To address these trade-offs, we introduce HYDRA, a method that leverages a hybrid action space with two levels of action abstractions: sparse high-level waypoints and dense low-level actions. HYDRA dynamically switches between action abstractions at test time to enable both coarse and fine-grained control of a robot. In addition, HYDRA employs action relabeling to increase the consistency of actions in the dataset, further reducing distribution shift. HYDRA outperforms prior imitation learning methods by 30-40% on seven challenging simulation and real world environments, involving long-horizon tasks in the real world like making coffee and toasting bread. Videos are found on our website: https://tinyurl.com/3mc6793z", "keywords": "Imitation Learning;Robotics;Manipulation", "primary_area": "", "supplementary_material": "/attachment/1833bdf94c65a9083a2db236ae17dbae64b8337f.zip", "author": "Suneel Belkhale;Yuchen Cui;Dorsa Sadigh", "authorids": "~Suneel_Belkhale1;~Yuchen_Cui1;~Dorsa_Sadigh1", "gender": "M;F;F", "homepage": "https://github.com/suneelbelkhale;https://yuchencui.cc;https://dorsa.fyi/", "dblp": "236/5069;201/5416.html;117/3174", "google_scholar": ";qQz2cm8AAAAJ;ZaJEZpYAAAAJ", "orcid": "0000-0002-3963-7987;0000-0001-7417-1222;", "linkedin": "suneel-b-032b1a101/;;", "or_profile": "~Suneel_Belkhale1;~Yuchen_Cui1;~Dorsa_Sadigh1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nbelkhale2023hydra,\ntitle={{HYDRA}: Hybrid Robot Actions for Imitation Learning},\nauthor={Suneel Belkhale and Yuchen Cui and Dorsa Sadigh},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=_A15qsPswaK}\n}", "github": "https://sites.google.com/corp/view/hydra-il-2023", "project": "", "reviewers": "QMAq;gKhQ;1bNM;DQGz", "site": "https://openreview.net/forum?id=_A15qsPswaK", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;4", "rating_avg": 6.5, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 40, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1980458046389361241&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "_DYsYC9smK", "title": "DYNAMO-GRASP: DYNAMics-aware Optimization for GRASP Point Detection in 
Suction Grippers", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this research, we introduce a novel approach to the challenge of suction grasp point detection. Our method, exploiting the strengths of physics-based simulation and data-driven modeling, accounts for object dynamics during the grasping process, markedly enhancing the robot's capability to handle previously unseen objects and scenarios in real-world settings. We benchmark DYNAMO-GRASP against established approaches via comprehensive evaluations in both simulated and real-world environments. DYNAMO-GRASP delivers improved grasping performance with greater consistency in both simulated and real-world settings. Remarkably, in real-world tests with challenging scenarios, our method demonstrates a success rate improvement of up to 48\\% over SOTA methods. Demonstrating a strong ability to adapt to complex and unexpected object dynamics, our method offers robust generalization to real-world challenges. The results of this research set the stage for more reliable and resilient robotic manipulation in intricate real-world situations. Experiment videos, dataset, model, and code are available at: https://sites.google.com/view/dynamo-grasp.", "keywords": "Suction Grasping;Manipulation;Deep Learning;Vision", "primary_area": "", "supplementary_material": "", "author": "Boling Yang;Soofiyan Atar;Markus Grotz;Byron Boots;Joshua Smith", "authorids": "~Boling_Yang1;~Soofiyan_Atar1;~Markus_Grotz1;~Byron_Boots1;~Joshua_Smith2", "gender": "M;;M;;M", "homepage": "https://homes.cs.washington.edu/~bolingy/;;;;http://sensor.cs.washington.edu", "dblp": "203/5117.html;;173/7849;;s/JoshuaRSmith.html", "google_scholar": "sw__JwIAAAAJ;;https://scholar.google.de/citations?user=ywTBxOkAAAAJ;;LnAus20AAAAJ", "orcid": "0000-0002-6211-122X;;0000-0001-7257-5872;;0000-0002-5331-4770", "linkedin": "boling-yang-104534123/;;markus-grotz-75b55ab4/;;joshua-smith-b8a0b61/", "or_profile": "~Boling_Yang1;~Soofiyan_Atar1;~Markus_Grotz1;~Byron_Boots1;~Joshua_Smith2", "aff": "Department of Computer Science, University of Washington;;University of Washington;;University of Washington", "aff_domain": "cs.washington.edu;;uw.edu;;cs.washington.edu", "position": "Graduate Research Assistant;;Postdoc;;Full Professor", "bibtex": "@inproceedings{\nyang2023dynamograsp,\ntitle={{DYNAMO}-{GRASP}: {DYNAM}ics-aware Optimization for {GRASP} Point Detection in Suction Grippers},\nauthor={Boling Yang and Soofiyan Atar and Markus Grotz and Byron Boots and Joshua Smith},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=_DYsYC9smK}\n}", "github": "https://github.com/dynamo-grasp/dynamo-grasp", "project": "", "reviewers": "gTny;dsfY;GYg8;TQDd", "site": "https://openreview.net/forum?id=_DYsYC9smK", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;4;3;5", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12838909816884705762&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "_gZLyRGGuo", "title": "Learning Efficient Abstract 
Planning Models that Choose What to Predict", "track": "main", "status": "Poster", "tldr": "We introduce a new method for learning symbolic operators conducive to efficient task and motion planning because they only make predictions necessary for high-level planning.", "abstract": "An effective approach to solving long-horizon tasks in robotics domains with continuous state and action spaces is bilevel planning, wherein a high-level search over an abstraction of an environment is used to guide low-level decision-making. Recent work has shown how to enable such bilevel planning by learning abstract models in the form of symbolic operators and neural samplers. In this work, we show that existing symbolic operator learning approaches fall short in many robotics domains where a robot's actions tend to cause a large number of irrelevant changes in the abstract state. This is primarily because they attempt to learn operators that exactly predict all observed changes in the abstract state. To overcome this issue, we propose to learn operators that `choose what to predict' by only modelling changes necessary for abstract planning to achieve specified goals. Experimentally, we show that our approach learns operators that lead to efficient planning across 10 different hybrid robotics domains, including 4 from the challenging BEHAVIOR-100 benchmark, while generalizing to novel initial states, goals, and objects.", "keywords": "Learning for TAMP;Abstraction Learning;Long-horizon Problems", "primary_area": "", "supplementary_material": "/attachment/99f572d5697ce2a902cc0ab9fbfedab260f74bd4.zip", "author": "Nishanth Kumar;Willie McClinton;Rohan Chitnis;Tom Silver;Tom\u00e1s Lozano-P\u00e9rez;Leslie Pack Kaelbling", "authorids": "~Nishanth_Kumar1;~Willie_McClinton1;~Rohan_Chitnis1;~Tom_Silver1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "gender": "M;;M;M;M;F", "homepage": "http://nishanthjkumar.com/;https://wmcclinton.github.io/;https://rohanchitnis.com;https://web.mit.edu/tslvr/www/;http://people.csail.mit.edu/tlp/;http://people.csail.mit.edu/lpk/", "dblp": "211/7595;;151/9589;202/1778;90/752;k/LesliePackKaelbling", "google_scholar": "FE512o4AAAAJ;nwefjOEAAAAJ;rNcmwggAAAAJ;CMcsygMAAAAJ;gQOKAggAAAAJ;IcasIiwAAAAJ", "orcid": "0000-0001-9291-3728;;;;;0000-0001-6054-7145", "linkedin": "nishanth-kumar;;;;;", "or_profile": "~Nishanth_Kumar1;~Willie_McClinton1;~Rohan_Chitnis1;~Tom_Silver1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "aff": "The AI Institute;Massachusetts Institute of Technology;Meta;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "theaiinstitute.com;mit.edu;meta.com;mit.edu;mit.edu;mit.edu", "position": "Intern;PhD student;Researcher;PhD student;Full Professor;Full Professor", "bibtex": "@inproceedings{\nkumar2023learning,\ntitle={Learning Efficient Abstract Planning Models that Choose What to Predict},\nauthor={Nishanth Kumar and Willie McClinton and Rohan Chitnis and Tom Silver and Tom{\\'a}s Lozano-P{\\'e}rez and Leslie Pack Kaelbling},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=_gZLyRGGuo}\n}", "github": "https://github.com/Learning-and-Intelligent-Systems/predicators_behavior/releases/tag/corl-23-submission", "project": "", "reviewers": "uCVm;wErB;sXyD;i5Gu", "site": "https://openreview.net/forum?id=_gZLyRGGuo", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "5;3;4;2", "rating_avg": 8.0, "confidence_avg": 3.5, 
"replies_avg": 9, "authors#_avg": 6, "corr_rating_confidence": -0.4472135954999579, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12282877692138556166&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;1;1;1", "aff_unique_norm": "AI Institute;Massachusetts Institute of Technology;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": ";https://web.mit.edu;https://meta.com", "aff_unique_abbr": ";MIT;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "_xFJuqBId8c", "title": "Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement", "track": "main", "status": "Poster", "tldr": "A method for rearranging objects in scenes that present multi-modal placement locations via iterative pose de-noising on object-scene point clouds.", "abstract": "We propose a system for rearranging objects in a scene to achieve a desired object-scene placing relationship, such as a book inserted in an open slot of a bookshelf. The pipeline generalizes to novel geometries, poses, and layouts of both scenes and objects, and is trained from demonstrations to operate directly on 3D point clouds. Our system overcomes challenges associated with the existence of many geometrically-similar rearrangement solutions for a given scene. By leveraging an iterative pose de-noising training procedure, we can fit multi-modal demonstration data and produce multi-modal outputs while remaining precise and accurate. We also show the advantages of conditioning on relevant local geometric features while ignoring irrelevant global structure that harms both generalization and precision. We demonstrate our approach on three distinct rearrangement tasks that require handling multi-modality and generalization over object shape and pose in both simulation and the real world. 
Project website, code, and videos: https://anthonysimeonov.github.io/rpdiff-multi-modal", "keywords": "Object Rearrangement;Multi-modality;Manipulation;Point Clouds;Relations;Diffusion", "primary_area": "", "supplementary_material": "/attachment/479ed7c368f7b972e26186bec2b85f12ac88d8a4.zip", "author": "Anthony Simeonov;Ankit Goyal;Lucas Manuelli;Yen-Chen Lin;Alina Sarmiento;Alberto Rodriguez Garcia;Pulkit Agrawal;Dieter Fox", "authorids": "~Anthony_Simeonov1;~Ankit_Goyal1;~Lucas_Manuelli1;~Yen-Chen_Lin1;alinasar@mit.edu;~Alberto_Rodriguez_Garcia1;~Pulkit_Agrawal1;~Dieter_Fox1", "gender": ";M;M;M;;M;M;M", "homepage": "https://anthonysimeonov.github.io/;http://imankgoyal.github.io/;http://lucasmanuelli.com;http://yenchenlin.me/;;http://mcube.mit.edu/;https://people.eecs.berkeley.edu/~pulkitag/;https://homes.cs.washington.edu/~fox/", "dblp": ";89/10051-1;;180/0954;;;149/2672;f/DieterFox", "google_scholar": ";RhN6jKIAAAAJ;0pxg5ssAAAAJ;RbCKRPcAAAAJ;;AC93g9kAAAAJ;UpZmJI0AAAAJ;DqXsbPAAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Anthony_Simeonov1;~Ankit_Goyal1;~Lucas_Manuelli1;~Yen-Chen_Lin1;alinasar@mit.edu;~Alberto_Rodriguez_Garcia1;~Pulkit_Agrawal1;~Dieter_Fox1", "aff": "Massachusetts Institute of Technology;NVIDIA;Boston Dynamics;Massachusetts Institute of Technology;;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Department of Computer Science", "aff_domain": "mit.edu;nvidia.com;bostondynamics.com;mit.edu;;mit.edu;mit.edu;cs.washington.edu", "position": "PhD student;Researcher;Researcher;PhD student;;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nsimeonov2023shelving,\ntitle={Shelving, Stacking, Hanging: Relational Pose Diffusion for Multi-modal Rearrangement},\nauthor={Anthony Simeonov and Ankit Goyal and Lucas Manuelli and Yen-Chen Lin and Alina Sarmiento and Alberto Rodriguez Garcia and Pulkit Agrawal and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=_xFJuqBId8c}\n}", "github": "https://github.com/anthonysimeonov/rpdiff", "project": "", "reviewers": "SEr7;HwxF;AZ9E;ZYnf;vuUj", "site": "https://openreview.net/forum?id=_xFJuqBId8c", "pdf_size": 0, "rating": "6;6;6;6;10", "confidence": "3;3;2;4;2", "rating_avg": 6.8, "confidence_avg": 2.8, "replies_avg": 23, "authors#_avg": 8, "corr_rating_confidence": -0.5345224838248488, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8551794841260142929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;0;0;0;3", "aff_unique_norm": "Massachusetts Institute of Technology;NVIDIA;Boston Dynamics;Unknown Institution", "aff_unique_dep": ";NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://web.mit.edu;https://www.nvidia.com;https://www.bostondynamics.com;", "aff_unique_abbr": "MIT;NVIDIA;BD;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "a0mFRgadGO", "title": "Bootstrap Your Own Skills: Learning to Solve New Tasks with Large Language Model Guidance", "track": "main", "status": "Oral", "tldr": "We propose BOSS, an approach that automatically learns to solve new long-horizon tasks by growing a learned skill library via LLM-guided exploration.", "abstract": "We propose BOSS, an approach that automatically learns to solve new long-horizon, complex, and meaningful tasks by growing a learned skill library with 
minimal supervision. Prior work in reinforcement learning requires expert supervision, in the form of demonstrations or rich reward functions, to learn long-horizon tasks. Instead, our approach BOSS (BOotStrapping your own Skills) learns to accomplish new tasks by performing \"skill bootstrapping,\" where an agent with a set of primitive skills interacts with the environment to practice new skills without receiving reward feedback for tasks outside of the initial skill set. This bootstrapping phase is guided by large language models (LLMs) that inform the agent of meaningful skills to chain together. Through this process, BOSS builds a wide range of complex and useful behaviors from a basic set of primitive skills. We demonstrate through experiments in realistic household environments that agents trained with our LLM-guided bootstrapping procedure outperform those trained with naive bootstrapping as well as prior unsupervised skill acquisition methods on zero-shot execution of unseen, long-horizon tasks in new environments. Website at clvrai.com/boss.", "keywords": "Reinforcement Learning;Skill Learning;Large Language Models", "primary_area": "", "supplementary_material": "/attachment/a3dd4f623c6e577c424e9436f7a8b86398092197.zip", "author": "Jesse Zhang;Jiahui Zhang;Karl Pertsch;Ziyi Liu;Xiang Ren;Minsuk Chang;Shao-Hua Sun;Joseph J Lim", "authorids": "~Jesse_Zhang3;~Jiahui_Zhang5;~Karl_Pertsch1;~Ziyi_Liu2;~Xiang_Ren1;~Minsuk_Chang1;~Shao-Hua_Sun1;~Joseph_J_Lim1", "gender": "M;;;F;M;M;M;M", "homepage": "https://jessezhang.net;https://jiahui-3205.github.io/;https://kpertsch.github.io/;https://taichi-pink.github.io/Ziyi-Liu/;https://shanzhenren.github.io/;https://minsukchang.com;http://shaohua0116.github.io;http://people.csail.mit.edu/lim/", "dblp": ";;211/7137;;36/360-1;199/2711;158/9680;08/3086", "google_scholar": "fSXCOfEAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?view_op=list_works;;_moJlrIAAAAJ;1j2nBpoAAAAJ;uXsfnaQAAAAJ;jTnQTBoAAAAJ", "orcid": ";;;;;0000-0002-9441-3337;0000-0001-7579-6734;", "linkedin": ";jiahui-zhang-2269451a3/;;;xren7;minsuk/;shaohua0116/;", "or_profile": "~Jesse_Zhang3;~Jiahui_Zhang5;~Karl_Pertsch1;~Ziyi_Liu2;~Xiang_Ren1;~Minsuk_Chang1;~Shao-Hua_Sun1;~Joseph_J_Lim1", "aff": "Amazon;University of Southern California;University of Southern California;University of Southern California;University of Southern California;Research, Google;National Taiwan University;Korea Advanced Institute of Science & Technology", "aff_domain": "amazon.com;usc.edu;usc.edu;usc.edu;usc.edu;research.google.com;ntu.edu.tw;kaist.ac.kr", "position": "Intern;MS student;PhD student;Visiting Scholar;Associate Professor;Research Scientist;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nzhang2023bootstrap,\ntitle={Bootstrap Your Own Skills: Learning to Solve New Tasks with Large Language Model Guidance},\nauthor={Jesse Zhang and Jiahui Zhang and Karl Pertsch and Ziyi Liu and Xiang Ren and Minsuk Chang and Shao-Hua Sun and Joseph J Lim},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=a0mFRgadGO}\n}", "github": "", "project": "", "reviewers": "MjyV;CC9W;e5v5", "site": "https://openreview.net/forum?id=a0mFRgadGO", "pdf_size": 0, "rating": "6;6;10", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 10, "authors#_avg": 8, "corr_rating_confidence": 0.5, "gs_citation": 80, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=4989569206184954074&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1;1;2;3;4", "aff_unique_norm": "Amazon;University of Southern California;Google;National Taiwan University;Korea Advanced Institute of Science and Technology", "aff_unique_dep": "Amazon.com, Inc.;;Google Research;;", "aff_unique_url": "https://www.amazon.com;https://www.usc.edu;https://research.google;https://www.ntu.edu.tw;https://www.kaist.ac.kr", "aff_unique_abbr": "Amazon;USC;Google;NTU;KAIST", "aff_campus_unique_index": "1;1;1;1;2;3", "aff_campus_unique": ";Los Angeles;Mountain View;Taiwan", "aff_country_unique_index": "0;0;0;0;0;0;1;2", "aff_country_unique": "United States;China;South Korea" }, { "id": "afF8RGcBBP", "title": "PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play", "track": "main", "status": "Poster", "tldr": "We present a novel skill learning approach that leverages discrete diffusion models, to learn from language annotated play data.", "abstract": "Learning from unstructured and uncurated data has become the dominant paradigm for generative approaches in language or vision. Such unstructured and unguided behavior data, commonly known as play, is also easier to collect in robotics but much more difficult to learn from due to its inherently multimodal, noisy, and suboptimal nature. In this paper, we study this problem of learning goal-directed skill policies from unstructured play data which is labeled with language in hindsight. Specifically, we leverage advances in diffusion models to learn a multi-task diffusion model to extract robotic skills from play data. Using a conditional denoising diffusion process in the space of states and actions, we can gracefully handle the complexity and multimodality of play data and generate diverse and interesting robot behaviors. To make diffusion models more useful for skill learning, we encourage robotic agents to acquire a vocabulary of skills by introducing discrete bottlenecks into the conditional behavior generation process. In our experiments, we demonstrate the effectiveness of our approach across a wide variety of environments in both simulation and the real world. 
Video results available at https://play-fusion.github.io.", "keywords": "Diffusion Models;Learning from Play;Language-Driven Robotics", "primary_area": "", "supplementary_material": "/attachment/de4bedc3d606c647db33126fdf3e01cbfdddc9b7.zip", "author": "Lili Chen;Shikhar Bahl;Deepak Pathak", "authorids": "~Lili_Chen1;~Shikhar_Bahl1;~Deepak_Pathak1", "gender": ";;M", "homepage": "http://www.lilichen.me;https://www.cs.cmu.edu/~sbahl2/;https://www.cs.cmu.edu/~dpathak/", "dblp": "92/169;223/4390;155/9860", "google_scholar": "https://scholar.google.com/citations?hl=en;bdHgGgEAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;", "linkedin": "lili-chen/;;pathak22/", "or_profile": "~Lili_Chen1;~Shikhar_Bahl1;~Deepak_Pathak1", "aff": "Carnegie Mellon University;Meta Facebook;Carnegie Mellon University", "aff_domain": "cmu.edu;meta.com;cmu.edu", "position": "PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nchen2023playfusion,\ntitle={PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play},\nauthor={Lili Chen and Shikhar Bahl and Deepak Pathak},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=afF8RGcBBP}\n}", "github": "", "project": "", "reviewers": "JiUe;aiX8;FBbz;wanB", "site": "https://openreview.net/forum?id=afF8RGcBBP", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "5;3;4;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 14, "authors#_avg": 3, "corr_rating_confidence": -0.816496580927726, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7679012324551172940&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "CMU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "b-cto-fetlz", "title": "HomeRobot: Open-Vocabulary Mobile Manipulation", "track": "main", "status": "Poster", "tldr": "We propose \"open vocabulary mobile manipulation\" as a key problem for robotics, and provide both a simulation and a reproducible real-world benchmark.", "abstract": "HomeRobot (noun): An affordable compliant robot that navigates homes and manipulates a wide range of objects in order to complete everyday tasks.\n\nOpen-Vocabulary Mobile Manipulation (OVMM) is the problem of picking any object in any unseen environment, and placing it in a commanded location. This is a foundational challenge for robots to be useful assistants in human environments, because it involves tackling sub-problems from across robotics: perception, language understanding, navigation, and manipulation are all essential to OVMM. In addition, integration of the solutions to these sub-problems poses its own substantial challenges. To drive research in this area, we introduce the HomeRobot OVMM benchmark, where an agent navigates household environments to grasp novel objects and place them on target receptacles. HomeRobot has two components: a simulation component, which uses a large and diverse curated object set in new, high-quality multi-room home environments; and a real-world component, providing a software stack for the low-cost Hello Robot Stretch to encourage replication of real-world experiments across labs. 
We implement both reinforcement learning and heuristic (model-based) baselines and show evidence of sim-to-real transfer. Our baselines achieve a 20% success rate in the real world; our experiments identify ways in which future research can improve performance. See videos on our website: https://home-robot-ovmm.github.io/.", "keywords": "benchmark;mobile manipulation;sim2real", "primary_area": "", "supplementary_material": "/attachment/be00cd3d10d662e346d9aaf3552c403572778a51.zip", "author": "Sriram Yenamandra;Arun Ramachandran;Karmesh Yadav;Austin S Wang;Mukul Khanna;Theophile Gervet;Tsung-Yen Yang;Vidhi Jain;Alexander Clegg;John M Turner;Zsolt Kira;Manolis Savva;Angel X Chang;Devendra Singh Chaplot;Dhruv Batra;Roozbeh Mottaghi;Yonatan Bisk;Chris Paxton", "authorids": "~Sriram_Yenamandra1;~Arun_Ramachandran1;~Karmesh_Yadav1;~Austin_S_Wang1;~Mukul_Khanna1;~Theophile_Gervet1;~Tsung-Yen_Yang2;~Vidhi_Jain2;~Alexander_Clegg1;~John_M_Turner1;~Zsolt_Kira1;~Manolis_Savva1;~Angel_X_Chang1;~Devendra_Singh_Chaplot2;~Dhruv_Batra1;~Roozbeh_Mottaghi1;~Yonatan_Bisk1;~Chris_Paxton1", "gender": "M;;M;M;M;M;;F;M;;M;M;F;Not Specified;;M;M;M", "homepage": ";https://arunram.me;https://www.karmeshyadav.com;;https://mukulkhanna.github.io;https://theophilegervet.github.io;https://sites.google.com/view/tyjimmyyang;http://vidhijain.github.io;;http://johnmturner.com/;https://faculty.cc.gatech.edu/~zk15;http://msavva.github.io/;https://angelxuanchang.github.io;https://dhruvbatra.com;http://roozbehm.info;http://www.YonatanBisk.com;https://cpaxton.github.io/;https://devendrachaplot.github.io/", "dblp": "291/9224;;264/3702;;255/5833;;204/7980;199/2574;165/9761;;36/4127;21/9924;46/10489;67/6586;36/633;38/9282;;161/0038", "google_scholar": ";GdUWd0QKzTQC;VsTyEcQAAAAJ;keDqjK0AAAAJ;kWAlOAkAAAAJ;-o8kQPwAAAAJ;g-hQdY8AAAAJ;;https://scholar.google.com/citations?hl=en;;2a5XgNAAAAAJ;4D2vsdYAAAAJ;8gfs8XIAAAAJ;_bs7PqgAAAAJ;CCV58dgAAAAJ;bWoGh8UAAAAJ;I1mOQpAAAAAJ;1MSpdmQAAAAJ", "orcid": ";;;;;;;;;;0000-0002-2626-2004;0000-0001-6132-8964;0009-0003-5055-6437;;;0000-0002-2111-9081;;", "linkedin": ";aramach;karmesh-yadav/;;mukulkhanna/;theophile-gervet/;tsung-yen-yang;vidhijain96/;alexander-clegg-68336839;;;manolis-savva-39591a2b/;;;roozbeh-mottaghi-63397aa0;yonatanbisk/;;", "or_profile": "~Sriram_Yenamandra1;~Arun_Ramachandran1;~Karmesh_Yadav1;~Austin_S_Wang1;~Mukul_Khanna1;~Theophile_Gervet1;~Tsung-Yen_Yang2;~Vidhi_Jain2;~Alexander_Clegg1;~John_M_Turner1;~Zsolt_Kira1;~Manolis_Savva1;~Angel_X_Chang1;~Dhruv_Batra1;~Roozbeh_Mottaghi1;~Yonatan_Bisk1;~Chris_Paxton1;~Devendra_Chaplot1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Meta AI;Meta Facebook;Georgia Institute of Technology;Carnegie Mellon University;Meta AI;Google;Meta AI;;Georgia Tech Research Institute;Simon Fraser University;Simon Fraser University;Georgia Institute of Technology;University of Washington;Meta;Meta Platforms;Meta Facebook", "aff_domain": "gatech.edu;gatech.edu;meta.com;fb.com;gatech.edu;cmu.edu;meta.com;google.com;meta.com;;gtri.gatech.edu;sfu.ca;sfu.ca;gatech.edu;cs.washington.edu;meta.com;meta.com;fb.com", "position": "MS student;MS student;Researcher;Researcher;MS student;PhD student;Researcher;Student Researcher;Researcher;;Senior Research Scientist;Assistant Professor;Assistant Professor;Associate Professor;Affiliate Professor ;Visiting Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nyenamandra2023homerobot,\ntitle={HomeRobot: Open-Vocabulary Mobile Manipulation},\nauthor={Sriram Yenamandra and Arun Ramachandran and Karmesh 
Yadav and Austin S Wang and Mukul Khanna and Theophile Gervet and Tsung-Yen Yang and Vidhi Jain and Alexander Clegg and John M Turner and Zsolt Kira and Manolis Savva and Angel X Chang and Devendra Singh Chaplot and Dhruv Batra and Roozbeh Mottaghi and Yonatan Bisk and Chris Paxton},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=b-cto-fetlz}\n}", "github": "https://github.com/facebookresearch/home-robot", "project": "", "reviewers": "Jmt1;CNKn;kzRx;uxxU", "site": "https://openreview.net/forum?id=b-cto-fetlz", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;4;4;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 11, "authors#_avg": 18, "corr_rating_confidence": 0.0, "gs_citation": 98, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9955154956387142167&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1;0;2;1;3;1;4;5;5;0;6;1;1;1", "aff_unique_norm": "Georgia Institute of Technology;Meta;Carnegie Mellon University;Google;Georgia Tech Research Institute;Simon Fraser University;University of Washington", "aff_unique_dep": ";Meta AI;;Google;;;", "aff_unique_url": "https://www.gatech.edu;https://meta.com;https://www.cmu.edu;https://www.google.com;https://www.gtri.gatech.edu;https://www.sfu.ca;https://www.washington.edu", "aff_unique_abbr": "Georgia Tech;Meta;CMU;Google;GTRI;SFU;UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;1;1;0;0;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "b1tl3aOt2R2", "title": "GNFactor: Multi-Task Real Robot Learning with Generalizable Neural Feature Fields", "track": "main", "status": "Oral", "tldr": "", "abstract": "It is a long-standing problem in robotics to develop agents capable of executing diverse manipulation tasks from visual observations in unstructured real-world environments. To achieve this goal, the robot will need to have a comprehensive understanding of the 3D structure and semantics of the scene. In this work, we present $\\textbf{GNFactor}$, a visual behavior cloning agent for multi-task robotic manipulation with $\\textbf{G}$eneralizable $\\textbf{N}$eural feature $\\textbf{F}$ields. GNFactor jointly optimizes a neural radiance field (NeRF) as a reconstruction module and a Perceiver Transformer as a decision-making module, leveraging a shared deep 3D voxel representation. To incorporate semantics in 3D, the reconstruction module incorporates a vision-language foundation model (e.g., Stable Diffusion) to distill rich semantic information into the deep 3D voxel. We evaluate GNFactor on 3 real-robot tasks and perform detailed ablations on 10 RLBench tasks with a limited number of demonstrations. We observe a substantial improvement of GNFactor over current state-of-the-art methods in seen and unseen tasks, demonstrating the strong generalization ability of GNFactor. 
Project website: https://yanjieze.com/GNFactor/", "keywords": "Robotic Manipulation;Neural Radiance Field;Behavior Cloning", "primary_area": "", "supplementary_material": "/attachment/882b5c47f01509003bbad5f411fdb85597a75f3e.zip", "author": "Yanjie Ze;Ge Yan;Yueh-Hua Wu;Annabella Macaluso;Yuying Ge;Jianglong Ye;Nicklas Hansen;Li Erran Li;Xiaolong Wang", "authorids": "~Yanjie_Ze1;~Ge_Yan3;~Yueh-Hua_Wu1;~Annabella_Macaluso1;~Yuying_Ge2;~Jianglong_Ye1;~Nicklas_Hansen1;~Li_Erran_Li1;~Xiaolong_Wang3", "gender": "M;Not Specified;;F;F;M;Non-Binary;;M", "homepage": "http://yanjieze.com;https://geyan21.github.io/;;https://annabellamacaluso.github.io/;https://geyuying.github.io/;https://jianglongye.com;https://nicklashansen.github.io;http://www.cs.columbia.edu/~lierranli/;https://xiaolonw.github.io/", "dblp": "312/5407;169/8155-6;;;223/4673;307/5025;258/0744.html;l/ErranLLi.html;91/952-4", "google_scholar": "BO_b2O8AAAAJ;ma7qW2kAAAAJ;;yqqESloAAAAJ;hv1LiiEAAAAJ;nkEGpKsAAAAJ;OFtDgzwAAAAJ;GkMfzy4AAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;;0000-0003-1347-9199;0000-0001-9897-4003;;", "linkedin": "yanjie-ze-a71a0a247/;ge-yan/;;annabella-macaluso-93a023192/;;;ncklas;;", "or_profile": "~Yanjie_Ze1;~Ge_Yan3;~Yueh-Hua_Wu1;~Annabella_Macaluso1;~Yuying_Ge2;~Jianglong_Ye1;~Nicklas_Hansen1;~Li_Erran_Li1;~Xiaolong_Wang3", "aff": "Shanghai Jiaotong University;University of California, San Diego;;University of California, San Diego;University of Hong Kong;University of California, San Diego;University of California, San Diego;Columbia University;University of California, San Diego", "aff_domain": "sjtu.edu.cn;ucsd.edu;;ucsd.edu;hku.hk;ucsd.edu;ucsd.edu;columbia.edu;ucsd.edu", "position": "Undergrad student;MS student;;Undergrad student;PhD student;PhD student;PhD student;Adjunct Professor;Assistant Professor", "bibtex": "@inproceedings{\nze2023gnfactor,\ntitle={{GNF}actor: Multi-Task Real Robot Learning with Generalizable Neural Feature Fields},\nauthor={Yanjie Ze and Ge Yan and Yueh-Hua Wu and Annabella Macaluso and Yuying Ge and Jianglong Ye and Nicklas Hansen and Li Erran Li and Xiaolong Wang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=b1tl3aOt2R2}\n}", "github": "https://github.com/YanjieZe/GNFactor", "project": "", "reviewers": "Lzg7;dBGA;asyz;obdr", "site": "https://openreview.net/forum?id=b1tl3aOt2R2", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;5;3;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 22, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6912590777221418045&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;2;1;1;3;1", "aff_unique_norm": "Shanghai Jiao Tong University;University of California, San Diego;University of Hong Kong;Columbia University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ucsd.edu;https://www.hku.hk;https://www.columbia.edu", "aff_unique_abbr": "SJTU;UCSD;HKU;Columbia", "aff_campus_unique_index": "1;1;2;1;1;1", "aff_campus_unique": ";San Diego;Hong Kong SAR", "aff_country_unique_index": "0;1;1;0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "bIvIUNH9VQ", "title": "Hijacking Robot Teams Through Adversarial Communication", "track": "main", "status": "Oral", "tldr": "We contribute a novel black-box adversarial method that learns to hijack robot communication in a multi-agent setting without their ground truth reward or access to their 
policies", "abstract": "Communication is often necessary for robot teams to collaborate and complete a decentralized task. Multi-agent reinforcement learning (MARL) systems allow agents to learn how to collaborate and communicate to complete a task. These domains are ubiquitous and include safety-critical domains such as wildfire fighting, traffic control, or search and rescue missions. However, critical vulnerabilities may arise in communication systems as jamming the signals can interrupt the robot team. This work presents a framework for applying black-box adversarial attacks to learned MARL policies by manipulating only the communication signals between agents. Our system only requires observations of MARL policies after training is complete, as this is more realistic than attacking the training process. To this end, we imitate a learned policy of the targeted agents without direct interaction with the environment or ground truth rewards. Instead, we infer the rewards by only observing the behavior of the targeted agents. Our framework reduces reward by 201% compared to an equivalent baseline method and also shows favorable results when deployed in real swarm robots. Our novel attack methodology within MARL systems contributes to the field by enhancing our understanding on the reliability of multi-agent systems.", "keywords": "Adversarial Attacks;Multi-Agent Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/88e2a2704c7d309ea2bd1fa2f0e58a02822fb610.zip", "author": "Zixuan Wu;Sean Charles Ye;Byeolyi Han;Matthew Gombolay", "authorids": "~Zixuan_Wu2;~Sean_Charles_Ye1;~Byeolyi_Han1;~Matthew_Gombolay1", "gender": "M;M;;M", "homepage": "http://linkedin.com/in/zixuan-w-3b81a0152;;https://www.linkedin.com/in/byeolyi-han-17ab20195/;https://core-robotics.gatech.edu/", "dblp": ";;;144/1022", "google_scholar": ";_Md-58AAAAAJ;;Ihyz20wAAAAJ", "orcid": ";;;", "linkedin": ";sean-ye-a64334a4/;;", "or_profile": "~Zixuan_Wu2;~Sean_Charles_Ye1;~Byeolyi_Han1;~Matthew_Gombolay1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;cc.gatech.edu", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwu2023hijacking,\ntitle={Hijacking Robot Teams Through Adversarial Communication},\nauthor={Zixuan Wu and Sean Charles Ye and Byeolyi Han and Matthew Gombolay},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=bIvIUNH9VQ}\n}", "github": "", "project": "", "reviewers": "3YTc;fyCD;ooVv;N9UF", "site": "https://openreview.net/forum?id=bIvIUNH9VQ", "pdf_size": 0, "rating": "10;10;10;10", "confidence": "2;4;3;4", "rating_avg": 10.0, "confidence_avg": 3.25, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:g9agDe2M5wcJ:scholar.google.com/&scioq=Hijacking+Robot+Teams+Through+Adversarial+Communication&hl=en&as_sdt=0,33", "gs_version_total": 2, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "cjEI5qXoT0", "title": "Context-Aware Entity Grounding with Open-Vocabulary 3D Scene 
Graphs", "track": "main", "status": "Poster", "tldr": "Combing Open-vocabulary feature with 3D scene-graph to enable a context-aware entity locating within a scene.", "abstract": "We present an Open-Vocabulary 3D Scene Graph (OVSG), a formal framework for grounding a variety of entities, such as object instances, agents, and regions, with free-form text-based queries. Unlike conventional semantic-based object localization approaches, our system facilitates context-aware entity localization, allowing for queries such as \u201cpick up a cup on a kitchen table\u201d or \u201cnavigate to a sofa on which someone is sitting\u201d. In contrast to existing research on 3D scene graphs, OVSG supports free-form text input and open-vocabulary querying. Through a series of comparative experiments using the ScanNet dataset and a self-collected dataset, we demonstrate that our proposed approach significantly surpasses the performance of previous semantic-based localization techniques. Moreover, we highlight the practical application of OVSG in real-world robot navigation and manipulation experiments. The code and dataset used for evaluation will be made available upon publication.", "keywords": "Open-Vocabulary Semantic;Scene Graph;Object Grounding", "primary_area": "", "supplementary_material": "/attachment/b71b54839930812924127f44939920c218a14e37.zip", "author": "Haonan Chang;Kowndinya Boyalakuntla;Shiyang Lu;Siwei Cai;Eric Pu Jing;Shreesh Keskar;Shijie Geng;Adeeb Abbas;Lifeng Zhou;Kostas Bekris;Abdeslam Boularias", "authorids": "~Haonan_Chang1;~Kowndinya_Boyalakuntla1;~Shiyang_Lu2;~Siwei_Cai1;~Eric_Pu_Jing1;~Shreesh_Keskar1;~Shijie_Geng1;~Adeeb_Abbas1;~Lifeng_Zhou2;~Kostas_Bekris1;~Abdeslam_Boularias1", "gender": "M;M;M;M;M;M;M;Not Specified;;M;M", "homepage": "https://github.com/changhaonan;https://kowndinya2000.github.io;;https://www.linkedin.com/in/bill-cai/;https://ericjing.com/;;;https://github.com/adeeb10abbas;;https://pracsys.cs.rutgers.edu/members/kostas-bekris/;http://rl.cs.rutgers.edu/", "dblp": ";297/4080;;;;;171/3642;;;42/170;57/2269", "google_scholar": ";tv9b9Y4AAAAJ;pDH5AcsAAAAJ;;;;wujqvGYAAAAJ;-PdPv0IAAAAJ;;https://scholar.google.com.tw/citations?user=gwC7rCUAAAAJ;https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ", "orcid": ";0000-0002-3112-9718;;;;;;;;;", "linkedin": ";kowndinya-boyalakuntla-b4b254155/;shiyang-lu-b204ab106/;;eric-jing-47308b12b/;https://linkedin.com/in/keskarshreesh;;;;kostas-bekris-0b56794/;", "or_profile": "~Haonan_Chang1;~Kowndinya_Boyalakuntla1;~Shiyang_Lu2;~Siwei_Cai1;~Eric_Pu_Jing1;~Shreesh_Keskar1;~Shijie_Geng1;~Adeeb_Abbas1;~Lifeng_Zhou2;~Kostas_Bekris1;~Abdeslam_Boularias1", "aff": "Rutgers, New Brunswick;Rutgers University;Rutgers University - New Brunswick;Drexel University;Rutgers University;Rutgers University;ByteDance Inc.;Northeastern University;;Rutgers University;, Rutgers University", "aff_domain": "scarletmail.rutgers.edu;rutgers.edu;rutgers.edu;drexel.edu;rutgers.edu;cs.rutgers.edu;bytedance.com;northeastern.edu;;rutgers.edu;cs.rutgers.edu", "position": "PhD student;MS student;PhD student;MS student;PhD student;MS student;Researcher;PhD student;;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nchang2023contextaware,\ntitle={Context-Aware Entity Grounding with Open-Vocabulary 3D Scene Graphs},\nauthor={Haonan Chang and Kowndinya Boyalakuntla and Shiyang Lu and Siwei Cai and Eric Pu Jing and Shreesh Keskar and Shijie Geng and Adeeb Abbas and Lifeng Zhou and Kostas Bekris and Abdeslam Boularias},\nbooktitle={7th Annual 
Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=cjEI5qXoT0}\n}", "github": "https://github.com/changhaonan/OVSG", "project": "", "reviewers": "zmgd;2AQt;UCiN", "site": "https://openreview.net/forum?id=cjEI5qXoT0", "pdf_size": 0, "rating": "6;6;6", "confidence": "5;4;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 11, "corr_rating_confidence": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8151231049989512768&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;1;0;0;2;3;0;0", "aff_unique_norm": "Rutgers University;Drexel University;ByteDance;Northeastern University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.rutgers.edu;https://www.drexel.edu;https://www.bytedance.com;https://www.northeastern.edu", "aff_unique_abbr": "Rutgers;Drexel;ByteDance;NEU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New Brunswick;", "aff_country_unique_index": "0;0;0;0;0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": "ckeT8cMz_A", "title": "REBOOT: Reuse Data for Bootstrapping Efficient Real-World Dexterous Manipulation", "track": "main", "status": "Poster", "tldr": "We learn dexterous manipulation skills on multi-fingered robot hand in the real world autonomously via sample efficient RL by bootstrapping from prior data", "abstract": "Dexterous manipulation tasks involving contact-rich interactions pose a significant challenge for both model-based control systems and imitation learning algorithms. The complexity arises from the need for multi-fingered robotic hands to dynamically establish and break contacts, balance forces on the non-prehensile object, and control a high number of degrees of freedom. Reinforcement learning (RL) offers a promising approach due to its general applicability and capacity to autonomously acquire optimal manipulation strategies. However, its real-world application is often hindered by the necessity to generate a large number of samples, reset the environment, and obtain reward signals. In this work, we introduce an efficient system for learning dexterous manipulation skills with RL to alleviate these challenges. The main idea of our approach is the integration of recent advancements in sample-efficient RL and replay buffer bootstrapping. This unique combination allows us to utilize data from different tasks or objects as a starting point for training new tasks, significantly improving learning efficiency. Additionally, our system completes the real-world training cycle by incorporating learned resets via an imitation-based pickup policy and learned reward functions, to eliminate the need for manual reset and reward engineering. We show the benefits of reusing past data as replay buffer initialization for new tasks, for instance, the fast acquisitions of intricate manipulation skills in the real world on a four-fingered robotic hand. 
\\href{https://sites.google.com/view/reboot-dexterous}{https://sites.google.com/view/reboot-dexterous})", "keywords": "Dexterous Manipulation;Reinforcement Learning;Sample-Efficient RL", "primary_area": "", "supplementary_material": "", "author": "Zheyuan Hu;Aaron Rovinsky;Jianlan Luo;Vikash Kumar;Abhishek Gupta;Sergey Levine", "authorids": "~Zheyuan_Hu3;~Aaron_Rovinsky1;~Jianlan_Luo1;~Vikash_Kumar2;~Abhishek_Gupta1;~Sergey_Levine1", "gender": ";M;;M;M;M", "homepage": "https://huzheyuan.io/;;https://people.eecs.berkeley.edu/~jianlanluo/;http://vikashplus.github.io/;https://homes.cs.washington.edu/~abhgupta/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";;161/1838;82/7475;18/6404-4;80/7594", "google_scholar": "https://scholar.google.com/citations?hl=en;dJj3vR4AAAAJ;SJoRNbYAAAAJ;nu3W--sAAAAJ;1wLVDP4AAAAJ;8R35rCwAAAAJ", "orcid": "0009-0000-0776-2380;;;;;", "linkedin": "huzheyuan/;aaronrovinsky/;;;;", "or_profile": "~Zheyuan_Hu3;~Aaron_Rovinsky1;~Jianlan_Luo1;~Vikash_Kumar2;~Abhishek_Gupta1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Google;Meta Facebook;University of Washington;Google", "aff_domain": "berkeley.edu;berkeley.edu;google.com;facebook.com;uw.edu;google.com", "position": "Undergrad student;Undergrad student;Researcher;Researcher;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nhu2023reboot,\ntitle={{REBOOT}: Reuse Data for Bootstrapping Efficient Real-World Dexterous Manipulation},\nauthor={Zheyuan Hu and Aaron Rovinsky and Jianlan Luo and Vikash Kumar and Abhishek Gupta and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ckeT8cMz_A}\n}", "github": "", "project": "", "reviewers": "ufd7;puFB;7J5e;tREV", "site": "https://openreview.net/forum?id=ckeT8cMz_A", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;3;4;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 27, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9679974044241611012&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;3;1", "aff_unique_norm": "University of California, Berkeley;Google;Meta;University of Washington", "aff_unique_dep": ";Google;Meta Platforms, Inc.;", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com;https://meta.com;https://www.washington.edu", "aff_unique_abbr": "UC Berkeley;Google;Meta;UW", "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "dIgCPoy8E3", "title": "Cross-Dataset Sensor Alignment: Making Visual 3D Object Detector Generalizable", "track": "main", "status": "Poster", "tldr": "This paper intduces a method to improve the generalization capability of camera-based 3D object detectors for autonomous driving by aligning sensor configurations, resulting in significant performance improvements across multiple datasets.", "abstract": "While camera-based 3D object detection has evolved rapidly, these models are susceptible to overfitting to specific sensor setups. For example, in autonomous driving, most datasets are collected using a single sensor configuration. This paper evaluates the generalization capability of camera-based 3D object detectors, including adapting detectors from one dataset to another and training detectors with multiple datasets. 
We observe that merely aggregating datasets yields drastic performance drops, contrary to the expected improvements associated with increased training data. To close the gap, we introduce an efficient technique for aligning disparate sensor configurations ---a combination of camera intrinsic synchronization, camera extrinsic correction, and ego frame alignment, which collectively enhance cross-dataset performance remarkably. Compared with single dataset baselines, we achieve 42.3 mAP improvement on KITTI, 23.2 mAP improvement on Lyft, 18.5 mAP improvement on nuScenes, 17.3 mAP improvement on KITTI-360, 8.4 mAP improvement on Argoverse2 and 3.9 mAP improvement on Waymo. We hope this comprehensive study can facilitate research on generalizable 3D object detection and associated tasks.", "keywords": "3D object detection;Model Generalization;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/c7777566d2789a5bb144f50df862f5c578d3297d.zip", "author": "Liangtao Zheng;Yicheng Liu;Yue Wang;Hang Zhao", "authorids": "~Liangtao_Zheng1;~Yicheng_Liu2;~Yue_Wang2;~Hang_Zhao1", "gender": ";M;M;M", "homepage": "https://github.com/ZLTJohn;https://mrmoore98.github.io/liuyicheng/;https://yuewang.xyz;http://www.mit.edu/~hangzhao/", "dblp": ";;33/4822-41;", "google_scholar": ";vRmsgQUAAAAJ;v-AEFIEAAAAJ;DmahiOYAAAAJ", "orcid": ";0000-0003-3211-3088;;", "linkedin": ";;;", "or_profile": "~Liangtao_Zheng1;~Yicheng_Liu2;~Yue_Wang2;~Hang_Zhao1", "aff": "Wuhan University;Tsinghua University;NVIDIA;Tsinghua University", "aff_domain": "whu.edu.cn;mail.tsinghua.edu.cn;nvidia.com;tsinghua.edu.cn", "position": "Undergrad student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzheng2023crossdataset,\ntitle={Cross-Dataset Sensor Alignment: Making Visual 3D Object Detector Generalizable},\nauthor={Liangtao Zheng and Yicheng Liu and Yue Wang and Hang Zhao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=dIgCPoy8E3}\n}", "github": "", "project": "", "reviewers": "NKfY;JvHZ;iJm9;vndD", "site": "https://openreview.net/forum?id=dIgCPoy8E3", "pdf_size": 0, "rating": "4;4;6;10", "confidence": "3;3;4;4", "rating_avg": 6.0, "confidence_avg": 3.5, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.8164965809277259, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18103541838356753023&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Wuhan University;Tsinghua University;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "http://www.whu.edu.cn/;https://www.tsinghua.edu.cn;https://www.nvidia.com", "aff_unique_abbr": "WHU;THU;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "dgwvY3H8PAS", "title": "Dynamic Handover: Throw and Catch with Bimanual Hands", "track": "main", "status": "Poster", "tldr": "We propose a throwing and catching system using bimanual dexterous hands with reinforcement learning.", "abstract": "Humans throw and catch objects all the time. However, such a seemingly common skill introduces a lot of challenges for robots to achieve: The robots need to operate such dynamic actions at high-speed, collaborate precisely, and interact with diverse objects. In this paper, we design a system with two multi-finger hands attached to robot arms to solve this problem. 
We train our system using Multi-Agent Reinforcement Learning in simulation and perform Sim2Real transfer to deploy on the real robots. To overcome the Sim2Real gap, we provide multiple novel algorithm designs including learning a trajectory prediction model for the object. Such a model can help the robot catcher obtain a real-time estimate of where the object will be heading and then react accordingly. We conduct our experiments with multiple objects in the real-world system, and show significant improvements over multiple baselines. Our project page is available at https://binghao-huang.github.io/dynamic_handover/", "keywords": "Bimanual Dexterous Manipulation;Sim-to-Real Transfer", "primary_area": "", "supplementary_material": "/attachment/0d29c2aabeb24df59cea86e33416e50c3aa7c5c9.zip", "author": "Binghao Huang;Yuanpei Chen;Tianyu Wang;Yuzhe Qin;Yaodong Yang;Nikolay Atanasov;Xiaolong Wang", "authorids": "~Binghao_Huang1;~Yuanpei_Chen2;~Tianyu_Wang1;~Yuzhe_Qin1;~Yaodong_Yang1;~Nikolay_Atanasov1;~Xiaolong_Wang3", "gender": ";M;;M;M;;M", "homepage": "https://binghao-huang.github.io/;https://cypypccpy.github.io/;;https://yzqin.github.io/;https://www.yangyaodong.com;http://natanaso.github.io;https://xiaolonw.github.io/", "dblp": ";1234567;;241/9337;170/1496-1;117/2111;91/952-4", "google_scholar": "nqoOetAAAAAJ;https://scholar.google.com/citations?hl=en;;3KF3AIMAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;RTkSatQAAAAJ;Y8O9N_0AAAAJ", "orcid": ";0000-0002-0033-492X;;0000-0002-9321-9305;0000-0001-8132-5613;0000-0003-0272-7580;", "linkedin": ";;;;yaodong-yang;nikolay-atanasov-b034b27;", "or_profile": "~Binghao_Huang1;~Yuanpei_Chen2;~Tianyu_Wang1;~Yuzhe_Qin1;~Yaodong_Yang1;~Nikolay_Atanasov1;~Xiaolong_Wang3", "aff": "University of California, San Diego;South China University of Technology;University of California, San Diego;University of California, San Diego;Peking University;University of California, San Diego;University of California, San Diego", "aff_domain": "ucsd.edu;scut.edu.cn;ucsd.edu;ucsd.edu;pku.edu.cn;ucsd.edu;ucsd.edu", "position": "MS student;Undergrad student;PhD student;PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2023dynamic,\ntitle={Dynamic Handover: Throw and Catch with Bimanual Hands},\nauthor={Binghao Huang and Yuanpei Chen and Tianyu Wang and Yuzhe Qin and Yaodong Yang and Nikolay Atanasov and Xiaolong Wang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=dgwvY3H8PAS}\n}", "github": "", "project": "", "reviewers": "PiJr;GCGh;wGRc;VRgh", "site": "https://openreview.net/forum?id=dgwvY3H8PAS", "pdf_size": 0, "rating": "6;10;10;10", "confidence": "4;4;4;4", "rating_avg": 9.0, "confidence_avg": 4.0, "replies_avg": 25, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 50, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16663729793116650827&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0;2;0;0", "aff_unique_norm": "University of California, San Diego;South China University of Technology;Peking University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.scut.edu.cn;http://www.pku.edu.cn", "aff_unique_abbr": "UCSD;SCUT;Peking U", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;1;0;0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "diOr96f65N", "title": "Quantifying Assistive Robustness 
Via the Natural-Adversarial Frontier", "track": "main", "status": "Poster", "tldr": "We propose a novel framework for evaluating the robustness of robot policies in assistive settings.", "abstract": "Our ultimate goal is to build robust policies for robots that assist people. What makes this hard is that people can behave unexpectedly at test time, potentially interacting with the robot outside its training distribution and leading to failures. Even just measuring robustness is a challenge. Adversarial perturbations are the default, but they can paint the wrong picture: they can correspond to human motions that are unlikely to occur during natural interactions with people. A robot policy might fail under small adversarial perturbations but work under large natural perturbations. We propose that capturing robustness in these interactive settings requires constructing and analyzing the entire natural-adversarial frontier: the Pareto-frontier of human policies that are the best trade-offs between naturalness and low robot performance. We introduce RIGID, a method for constructing this frontier by training adversarial human policies that trade off between minimizing robot reward and acting human-like (as measured by a discriminator). On an Assistive Gym task, we use RIGID to analyze the performance of standard collaborative RL, as well as the performance of existing methods meant to increase robustness. We also compare the frontier RIGID identifies with the failures identified in expert adversarial interaction, and with naturally-occurring failures during user interaction. Overall, we find evidence that RIGID can provide a meaningful measure of robustness predictive of deployment performance, and uncover failure cases that are difficult to find manually.", "keywords": "assistive robots;safety;human-robot interaction;adversarial robustness", "primary_area": "", "supplementary_material": "/attachment/2ab3873b9e1bd0e40efd2dcb9679867a8a87b5a3.zip", "author": "Jerry Zhi-Yang He;Daniel S. Brown;Zackory Erickson;Anca Dragan", "authorids": "~Jerry_Zhi-Yang_He1;~Daniel_S._Brown1;~Zackory_Erickson1;~Anca_Dragan1", "gender": "M;M;M;F", "homepage": "https://herobotics.me;https://www.cs.utah.edu/~dsbrown/;https://zackory.com;http://www.ancadragan.com/", "dblp": ";141/7769;;", "google_scholar": ";https://scholar.google.com/citations?hl=en;wElkTtIAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Jerry_Zhi-Yang_He1;~Daniel_S._Brown1;~Zackory_Erickson1;~Anca_Dragan1", "aff": ";University of Utah;Carnegie Mellon University;University of California, Berkeley", "aff_domain": ";utah.edu;cmu.edu;berkeley.edu", "position": ";Assistant Professor;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhe2023quantifying,\ntitle={Quantifying Assistive Robustness Via the Natural-Adversarial Frontier},\nauthor={Jerry Zhi-Yang He and Daniel S. 
Brown and Zackory Erickson and Anca Dragan},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=diOr96f65N}\n}", "github": "", "project": "", "reviewers": "KfMg;KeGh;wTf2;XUqi", "site": "https://openreview.net/forum?id=diOr96f65N", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;4;2;3", "rating_avg": 6.5, "confidence_avg": 3.5, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.5129891760425771, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:n9syLy-ssesJ:scholar.google.com/&scioq=Quantifying+Assistive+Robustness+Via+the+Natural-Adversarial+Frontier&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Utah;Carnegie Mellon University;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.utah.edu;https://www.cmu.edu;https://www.berkeley.edu", "aff_unique_abbr": "Utah;CMU;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "dk-2R1f_LR", "title": "MimicGen: A Data Generation System for Scalable Robot Learning using Human Demonstrations", "track": "main", "status": "Poster", "tldr": "We introduce MimicGen, a system for automatically synthesizing large-scale datasets from a small number of human demonstrations by adapting them to new scene configurations, object instances, and robot arms.", "abstract": "Imitation learning from a large set of human demonstrations has proved to be an effective paradigm for building capable robot agents. However, the demonstrations can be extremely costly and time-consuming to collect. We introduce MimicGen, a system for automatically synthesizing large-scale, rich datasets from only a small number of human demonstrations by adapting them to new contexts. We use MimicGen to generate over 50K demonstrations across 18 tasks with diverse scene configurations, object instances, and robot arms from just ~200 human demonstrations. We show that robot agents can be effectively trained on this generated dataset by imitation learning to achieve strong performance in long-horizon and high-precision tasks, such as multi-part assembly and coffee preparation, across broad initial state distributions. We further demonstrate that the effectiveness and utility of MimicGen data compare favorably to collecting additional human demonstrations, making it a powerful and economical approach towards scaling up robot learning. 
Datasets, simulation environments, videos, and more at https://mimicgen.github.io .", "keywords": "Imitation Learning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/6e2f4af1d65f7020581fcff1dddfa303212475a8.zip", "author": "Ajay Mandlekar;Soroush Nasiriany;Bowen Wen;Iretiayo Akinola;Yashraj Narang;Linxi Fan;Yuke Zhu;Dieter Fox", "authorids": "~Ajay_Mandlekar1;~Soroush_Nasiriany1;~Bowen_Wen1;~Iretiayo_Akinola1;~Yashraj_Narang1;~Linxi_Fan2;~Yuke_Zhu1;~Dieter_Fox1", "gender": "M;;;M;M;;M;M", "homepage": "https://ai.stanford.edu/~amandlek/;http://snasiriany.me/;https://wenbowen123.github.io/;;;;https://cs.utexas.edu/~yukez/;https://homes.cs.washington.edu/~fox/", "dblp": "https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay;185/5645;;;215/6022.html;154/6778;133/1772;f/DieterFox", "google_scholar": "MEz23joAAAAJ;bBLqsgkAAAAJ;VSG7Z0kAAAAJ;e1zesfMAAAAJ;M3NuG7AAAAAJ;sljtWIUAAAAJ;mWGyYMsAAAAJ;DqXsbPAAAAAJ", "orcid": ";;;;0000-0001-5445-3759;;;", "linkedin": ";;bowen-wen/;;;;;", "or_profile": "~Ajay_Mandlekar1;~Soroush_Nasiriany1;~Bowen_Wen1;~Iretiayo_Akinola1;~Yashraj_Narang1;~Linxi_Fan2;~Yuke_Zhu1;~Dieter_Fox1", "aff": "NVIDIA;University of Texas, Austin;NVIDIA;NVIDIA;NVIDIA;NVIDIA;Computer Science Department, University of Texas, Austin;Department of Computer Science", "aff_domain": "nvidia.com;utexas.edu;nvidia.com;nvidia.com;nvidia.com;nvidia.com;cs.utexas.edu;cs.washington.edu", "position": "Researcher;PhD student;Researcher;Researcher;Researcher;Researcher;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nmandlekar2023mimicgen,\ntitle={MimicGen: A Data Generation System for Scalable Robot Learning using Human Demonstrations},\nauthor={Ajay Mandlekar and Soroush Nasiriany and Bowen Wen and Iretiayo Akinola and Yashraj Narang and Linxi Fan and Yuke Zhu and Dieter Fox},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=dk-2R1f_LR}\n}", "github": "https://github.com/NVlabs/mimicgen_environments", "project": "", "reviewers": "GHn9;pQft;Tm7E;f5G9", "site": "https://openreview.net/forum?id=dk-2R1f_LR", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;4;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 17, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 120, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9637692045386700913&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0;0;0;1;2", "aff_unique_norm": "NVIDIA;University of Texas at Austin;Unknown Institution", "aff_unique_dep": "NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.utexas.edu;", "aff_unique_abbr": "NVIDIA;UT Austin;", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "dxOaNO8bge", "title": "A Data-Efficient Visual-Audio Representation with Intuitive Fine-tuning for Voice-Controlled Robots", "track": "main", "status": "Poster", "tldr": "", "abstract": "A command-following robot that serves people in everyday life must continually improve itself in deployment domains with minimal help from its end users, instead of engineers. Previous methods are either difficult to continuously improve after the deployment or require a large number of new labels during fine-tuning. 
Motivated by (self-)supervised contrastive learning, we propose a novel representation that generates an intrinsic reward function for command-following robot tasks by associating images with sound commands.\nAfter the robot is deployed in a new domain, the representation can be updated intuitively and data-efficiently by non-experts without any hand-crafted reward functions. We demonstrate our approach on various sound types and robotic tasks, including navigation and manipulation with raw sensor inputs. In simulated and real-world experiments, we show that our system can continually self-improve in previously unseen scenarios with fewer new labels, while still achieving better performance than previous methods.", "keywords": "Command Following;Multimodal Representation;Reinforcement Learning;Human-in-the-Loop", "primary_area": "", "supplementary_material": "/attachment/655dccd3062446f97774fe5fc82e0109a1a064f9.zip", "author": "Peixin Chang;Shuijing Liu;Tianchen Ji;Neeloy Chakraborty;Kaiwen Hong;Katherine Rose Driggs-Campbell", "authorids": "~Peixin_Chang1;~Shuijing_Liu1;~Tianchen_Ji1;~Neeloy_Chakraborty1;~Kaiwen_Hong1;~Katherine_Rose_Driggs-Campbell1", "gender": "M;F;M;M;M;", "homepage": ";https://shuijing725.github.io;https://tianchenji.github.io/;https://theneeloy.github.io/;;", "dblp": ";211/7210;;278/2404;;", "google_scholar": "0AloliwAAAAJ;I4k7ukgAAAAJ;9XgufxkAAAAJ;Fwc4xyEAAAAJ;;", "orcid": ";;;0000-0001-7132-6671;;", "linkedin": ";shuijing-liu-4089b3123;;neeloy-chakraborty/;kaiwen-hong-524520141/;", "or_profile": "~Peixin_Chang1;~Shuijing_Liu1;~Tianchen_Ji1;~Neeloy_Chakraborty1;~Kaiwen_Hong1;~Katherine_Rose_Driggs-Campbell1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;UIUC;", "aff_domain": "uiuc.edu;uiuc.edu;uiuc.edu;uiuc.edu;illinois.edu;", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;", "bibtex": "@inproceedings{\nchang2023a,\ntitle={A Data-Efficient Visual-Audio Representation with Intuitive Fine-tuning for Voice-Controlled Robots},\nauthor={Peixin Chang and Shuijing Liu and Tianchen Ji and Neeloy Chakraborty and Kaiwen Hong and Katherine Rose Driggs-Campbell},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=dxOaNO8bge}\n}", "github": "", "project": "", "reviewers": "TXeX;HeXC;b57y;Tifq;S2KB", "site": "https://openreview.net/forum?id=dxOaNO8bge", "pdf_size": 0, "rating": "4;6;6;6;10", "confidence": "3;3;4;4;3", "rating_avg": 6.4, "confidence_avg": 3.4, "replies_avg": 15, "authors#_avg": 6, "corr_rating_confidence": -0.16666666666666663, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7102386573359670346&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "eE3fsO5Mi2", "title": "Stealthy Terrain-Aware Multi-Agent Active Search", "track": "main", "status": "Poster", "tldr": "Leveraging a known terrain map and Thompson sampling based active learning to set new state of the art in search efficiency with minimal visibility risk in realistic search settings.", "abstract": "Stealthy 
multi-agent active search is the problem of making efficient sequential data-collection decisions to identify an unknown number of sparsely located targets while adapting to new sensing information and concealing the search agents' location from the targets. This problem is applicable to reconnaissance tasks wherein the safety of the search agents can be compromised as the targets may be adversarial. Prior work usually focuses either on adversarial search, where the risk of revealing the agents' location to the targets is ignored, or on evasion strategies, where efficient search is ignored. We present the Stealthy Terrain-Aware Reconnaissance (STAR) algorithm, a multi-objective parallelized Thompson sampling-based algorithm that relies on a strong topographical prior to reason over changing visibility risk over the course of the search. The STAR algorithm outperforms existing state-of-the-art multi-agent active search methods in both the rate of target recovery and risk minimisation, even when subject to noisy observations, communication failures and an unknown number of targets.", "keywords": "Reconnaissance;Adversarial Search;Multi-robot;Active Learning", "primary_area": "", "supplementary_material": "/attachment/11e1702c12e0aca4f5aa264833529d15a678f4d5.zip", "author": "Nikhil Angad Bakshi;Jeff Schneider", "authorids": "~Nikhil_Angad_Bakshi1;~Jeff_Schneider1", "gender": "M;", "homepage": ";https://www.cs.cmu.edu/~schneide", "dblp": ";38/247", "google_scholar": "bZrOaXgAAAAJ;3bSbb20AAAAJ", "orcid": ";0000-0002-5080-9073", "linkedin": "nikhil-angad-bakshi/;jeff-schneider-1593b322/", "or_profile": "~Nikhil_Angad_Bakshi1;~Jeff_Schneider1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu", "position": "MS student;Researcher", "bibtex": "@inproceedings{\nbakshi2023stealthy,\ntitle={Stealthy Terrain-Aware Multi-Agent Active Search},\nauthor={Nikhil Angad Bakshi and Jeff Schneider},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=eE3fsO5Mi2}\n}", "github": "https://github.com/bakshienator77/Stealthy-Terrain-Aware-Reconnaissance-and-Search", "project": "", "reviewers": "o4rp;AFR3;ajuq;b4aX", "site": "https://openreview.net/forum?id=eE3fsO5Mi2", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;2;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 22, "authors#_avg": 2, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12583806064340482540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "efaE7iJ2GJv", "title": "PolarNet: 3D Point Clouds for Language-Guided Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability for robots to comprehend and execute manipulation tasks based on natural language instructions is a long-term goal in robotics. The dominant approaches for language-guided manipulation use 2D image representations, which face difficulties in combining multi-view cameras and inferring precise 3D positions and relationships. To address these limitations, we propose a 3D point cloud based policy called PolarNet for language-guided manipulation. 
It leverages carefully designed point cloud inputs, efficient point cloud encoders, and multimodal transformers to learn 3D point cloud representations and integrate them with language instructions for action prediction.\nPolarNet is shown to be effective and data efficient in a variety of experiments conducted on the RLBench benchmark. It outperforms state-of-the-art 2D and 3D approaches in both single-task and multi-task learning. It also achieves promising results on a real robot.", "keywords": "Robotic manipulation;3D point clouds;language-guided policy", "primary_area": "", "supplementary_material": "/attachment/e69c1890f97f010cebab7224d311813e13ceeca2.zip", "author": "Shizhe Chen;Ricardo Garcia Pinel;Cordelia Schmid;Ivan Laptev", "authorids": "~Shizhe_Chen1;~Ricardo_Garcia_Pinel1;~Cordelia_Schmid1;~Ivan_Laptev1", "gender": "F;M;F;M", "homepage": "https://cshizhe.github.io/;https://rjgpinel.github.io/;https://cordeliaschmid.github.io/;https://www.di.ens.fr/~laptev/", "dblp": "153/0734;304/1714;s/CordeliaSchmid;41/1854", "google_scholar": "wZhRRy0AAAAJ;cMA5vJwAAAAJ;IvqCXP4AAAAJ;https://scholar.google.com.tw/citations?user=-9ifK0cAAAAJ", "orcid": ";0000-0002-2553-7272;;", "linkedin": ";rjgpinel;cordelia-schmid-47985a9;", "or_profile": "~Shizhe_Chen1;~Ricardo_Garcia_Pinel1;~Cordelia_Schmid1;~Ivan_Laptev1", "aff": "INRIA;INRIA;Inria;INRIA Paris", "aff_domain": "inria.fr;inria.fr;inria.fr;inria.fr", "position": "Postdoc;PhD student;Researcher;Senior Researcher", "bibtex": "@inproceedings{\nchen2023polarnet,\ntitle={PolarNet: 3D Point Clouds for Language-Guided Robotic Manipulation},\nauthor={Shizhe Chen and Ricardo Garcia Pinel and Cordelia Schmid and Ivan Laptev},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=efaE7iJ2GJv}\n}", "github": "", "project": "", "reviewers": "dUV7;2cU3;ABTj;mm9b", "site": "https://openreview.net/forum?id=efaE7iJ2GJv", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;4;3;3", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11665016862043823750&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 10, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "INRIA", "aff_unique_dep": "", "aff_unique_url": "https://www.inria.fr", "aff_unique_abbr": "INRIA", "aff_campus_unique_index": "1", "aff_campus_unique": ";Paris", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "id": "eyykI3UIHa", "title": "NOIR: Neural Signal Operated Intelligent Robots for Everyday Activities", "track": "main", "status": "Poster", "tldr": "We present a general-purpose, intelligent brain-robot interface system that enables humans to command robots to perform everyday activities through brain signals.", "abstract": "We present Neural Signal Operated Intelligent Robots (NOIR), a general-purpose, intelligent brain-robot interface system that enables humans to command robots to perform everyday activities through brain signals. Through this interface, humans communicate their intended objects of interest and actions to the robots using electroencephalography (EEG). Our novel system demonstrates success in an expansive array of 20 challenging, everyday household activities, including cooking, cleaning, personal care, and entertainment. 
The effectiveness of the system is improved by its synergistic integration of robot learning algorithms, allowing for NOIR to adapt to individual users and predict their intentions. Our work enhances the way humans interact with robots, replacing traditional channels of interaction with direct, neural communication.", "keywords": "Brain-Robot Interface;Human-Robot Interaction", "primary_area": "", "supplementary_material": "/attachment/a25205306e2bc21da558f13e5ed4e63fc0c9c631.zip", "author": "Ruohan Zhang;Sharon Lee;Minjune Hwang;Ayano Hiranaka;Chen Wang;Wensi Ai;Jin Jie Ryan Tan;Shreya Gupta;Yilun Hao;Gabrael Levine;Ruohan Gao;Anthony Norcia;Li Fei-Fei;Jiajun Wu", "authorids": "~Ruohan_Zhang1;~Sharon_Lee1;~Minjune_Hwang1;~Ayano_Hiranaka1;~Chen_Wang16;~Wensi_Ai1;~Jin_Jie_Ryan_Tan1;~Shreya_Gupta1;~Yilun_Hao1;~Gabrael_Levine1;~Ruohan_Gao2;~Anthony_Norcia1;~Li_Fei-Fei1;~Jiajun_Wu1", "gender": "M;F;M;;M;M;M;;;;M;M;F;M", "homepage": "https://ai.stanford.edu/~zharu/;;https://mj-hwang.github.io/;;http://www.chenwangjeremy.net/;https://wensi-ai.github.io;;;https://yih301.github.io;https://gabrael.io;https://ruohangao.github.io/;https://svndl.stanford.edu/;https://profiles.stanford.edu/fei-fei-li;https://jiajunwu.com", "dblp": ";51/758;263/9824;;;279/3054.html;;;285/4024;;176/5787;;79/2528;117/4768", "google_scholar": "-bqvNWoAAAAJ;jGwt3mcAAAAJ;juBEoEUAAAAJ;;lStkAzsAAAAJ;sjhu_wUAAAAJ;;;RjQF17YAAAAJ;;i02oEgMAAAAJ;;rDfyQnIAAAAJ;2efgcS0AAAAJ", "orcid": ";;;;;;;;;;0000-0002-8346-1114;;;0000-0002-4176-343X", "linkedin": ";sharonleeyen;minjune-hwang-751999138/;;;wensi-ai;ryantjj;shreya-gupta-08/;yilun-hao-86554a178/;;;;fei-fei-li-4541247/;jiajunwu/", "or_profile": "~Ruohan_Zhang1;~Sharon_Lee1;~Minjune_Hwang1;~Ayano_Hiranaka1;~Chen_Wang16;~Wensi_Ai1;~Jin_Jie_Ryan_Tan1;~Shreya_Gupta1;~Yilun_Hao1;~Gabrael_Levine1;~Ruohan_Gao2;~Anthony_Norcia1;~Li_Fei-Fei1;~Jiajun_Wu1", "aff": "Stanford University;;Stanford University;;Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;;stanford.edu;;cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu;cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "Postdoc;;MS student;;PhD student;MS student;Undergrad student;MS student;MS student;Undergrad student;Postdoc;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023noir,\ntitle={{NOIR}: Neural Signal Operated Intelligent Robots for Everyday Activities},\nauthor={Ruohan Zhang and Sharon Lee and Minjune Hwang and Ayano Hiranaka and Chen Wang and Wensi Ai and Jin Jie Ryan Tan and Shreya Gupta and Yilun Hao and Gabrael Levine and Ruohan Gao and Anthony Norcia and Li Fei-Fei and Jiajun Wu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=eyykI3UIHa}\n}", "github": "", "project": "", "reviewers": "KKsW;SXKL;beZJ;r85X", "site": "https://openreview.net/forum?id=eyykI3UIHa", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;3;4;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 6, "authors#_avg": 14, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1363899273957823351&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Stanford 
University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "f55MlAT1Lu", "title": "BridgeData V2: A Dataset for Robot Learning at Scale", "track": "main", "status": "Poster", "tldr": "We introduce a new dataset of robotic behaviors designed to facilitate research in large-scale robot learning.", "abstract": "We introduce BridgeData V2, a large and diverse dataset of robotic manipulation behaviors designed to facilitate research in scalable robot learning. BridgeData V2 contains 53,896 trajectories collected across 24 environments on a publicly available low-cost robot. Unlike many existing robotic manipulation datasets, BridgeData V2 provides enough task and environment variability that skills learned from the data generalize across institutions, making the dataset a useful resource for a broad range of researchers. Additionally, the dataset is compatible with a wide variety of open-vocabulary, multi-task learning methods conditioned on goal images or natural language instructions. In our experiments,we apply 6 state-of-the-art imitation learning and offline reinforcement learning methods to the data and find that they succeed on a suite of tasks requiring varying amounts of generalization. We also demonstrate that the performance of these methods improves with more data and higher capacity models. By publicly sharing BridgeData V2 and our pre-trained models, we aim to accelerate research in scalable robot learning methods.", "keywords": "Datasets;Manipulation;Imitation Learning;Offline Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/e4c1814befcf9d96f52cfb384f016da11cfa1f49.zip", "author": "Homer Rich Walke;Kevin Black;Tony Z. 
Zhao;Quan Vuong;Chongyi Zheng;Philippe Hansen-Estruch;Andre Wang He;Vivek Myers;Moo Jin Kim;Max Du;Abraham Lee;Kuan Fang;Chelsea Finn;Sergey Levine", "authorids": "~Homer_Rich_Walke1;~Kevin_Black2;~Tony_Z._Zhao1;~Quan_Vuong2;~Chongyi_Zheng1;~Philippe_Hansen-Estruch1;~Andre_Wang_He1;~Vivek_Myers1;~Moo_Jin_Kim1;~Max_Du1;~Abraham_Lee2;~Kuan_Fang3;~Chelsea_Finn1;~Sergey_Levine1", "gender": "M;;;M;M;;M;;M;M;M;;F;M", "homepage": "https://homerwalke.com;https://kevin.black;https://tonyzhaozh.github.io/;https://quanvuong.github.io;https://chongyi-zheng.github.io;;;https://people.eecs.berkeley.edu/~vmyers/;https://moojink.com;https://www.maximiliandu.com/;;;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "279/6795;66/9687;;;250/9267;;318/3206.html;270/8694;;239/8546;;;131/1783;80/7594", "google_scholar": "ZWH5jCwAAAAJ;axX7PCwAAAAJ;;NSWI3OwAAAAJ;bezWXYcAAAAJ;;;5NGAbT4AAAAJ;ZKRs0oEAAAAJ;nUTqrPkAAAAJ;;;vfPE6hgAAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;;;;;;;;;", "linkedin": ";;;;;;andre-he-08778219a/;;moojink/;;abraham-lee-4a0497242/;;;", "or_profile": "~Homer_Rich_Walke1;~Kevin_Black2;~Tony_Z._Zhao1;~Quan_Vuong2;~Chongyi_Zheng1;~Philippe_Hansen-Estruch1;~Andre_Wang_He1;~Vivek_Myers1;~Moo_Jin_Kim1;~Max_Du1;~Abraham_Lee2;~Kuan_Fang3;~Chelsea_Finn1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Stanford University;;Carnegie Mellon University;;UC Berkeley, University of California, Berkeley;University of California, Berkeley;Stanford University;Stanford University;University of California, Berkeley;;Google;Google", "aff_domain": "berkeley.edu;berkeley.edu;stanford.edu;;andrew.cmu.edu;;cs.berkeley.edu;berkeley.edu;stanford.edu;stanford.edu;berkeley.edu;;google.com;google.com", "position": "PhD student;PhD student;PhD student;;MS student;;Undergrad student;PhD student;PhD student;Undergrad student;Undergrad student;;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nwalke2023bridgedata,\ntitle={BridgeData V2: A Dataset for Robot Learning at Scale},\nauthor={Homer Rich Walke and Kevin Black and Tony Z. 
Zhao and Quan Vuong and Chongyi Zheng and Philippe Hansen-Estruch and Andre Wang He and Vivek Myers and Moo Jin Kim and Max Du and Abraham Lee and Kuan Fang and Chelsea Finn and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=f55MlAT1Lu}\n}", "github": "https://github.com/rail-berkeley/bridge_data_v2", "project": "", "reviewers": "SSre;dwgC;AXMo;xJRp", "site": "https://openreview.net/forum?id=f55MlAT1Lu", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;5", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 17, "authors#_avg": 14, "corr_rating_confidence": 0.9271726499455306, "gs_citation": 168, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17621191340486364101&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;0;0;1;1;0;3;3", "aff_unique_norm": "University of California, Berkeley;Stanford University;Carnegie Mellon University;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu;https://www.cmu.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Stanford;CMU;Google", "aff_campus_unique_index": "0;0;1;0;0;1;1;0;3;3", "aff_campus_unique": "Berkeley;Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "fNLBmtyBiC", "title": "A Bayesian approach to breaking things: efficiently predicting and repairing failure modes via sampling", "track": "main", "status": "Poster", "tldr": "By reframing adversarial optimization as Bayesian inference, we can efficiently predict and repair failure modes in autonomous systems.", "abstract": "Before autonomous systems can be deployed in safety-critical applications, we must be able to understand and verify the safety of these systems. For cases where the risk or cost of real-world testing is prohibitive, we propose a simulation-based framework for a) predicting ways in which an autonomous system is likely to fail and b) automatically adjusting the system's design to preemptively mitigate those failures. We frame this problem through the lens of approximate Bayesian inference and use differentiable simulation for efficient failure case prediction and repair. We apply our approach on a range of robotics and control problems, including optimizing search patterns for robot swarms and reducing the severity of outages in power transmission networks. 
Compared to optimization-based falsification techniques, our method predicts a more diverse, representative set of failure modes, and we also find that our use of differentiable simulation yields solutions that have up to 10x lower cost and requires up to 2x fewer iterations to converge relative to gradient-free techniques.", "keywords": "Automatic design tools;root-cause failure analysis;optimization-as-inference", "primary_area": "", "supplementary_material": "/attachment/2c81cc694a1d6b1c40bd5a131f34a492424011ca.zip", "author": "Charles Dawson;Chuchu Fan", "authorids": "~Charles_Dawson1;~Chuchu_Fan2", "gender": "M;F", "homepage": "https://dawson.mit.edu;https://chuchu.mit.edu", "dblp": "39/1246;127/1756", "google_scholar": "FkDdz9gAAAAJ;J-dq_8EAAAAJ", "orcid": "0000-0002-8371-5313;", "linkedin": "c6d5;chuchu-fan/", "or_profile": "~Charles_Dawson1;~Chuchu_Fan2", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ndawson2023a,\ntitle={A Bayesian approach to breaking things: efficiently predicting and repairing failure modes via sampling},\nauthor={Charles Dawson and Chuchu Fan},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=fNLBmtyBiC}\n}", "github": "https://github.com/MIT-REALM/architect_corl_23", "project": "", "reviewers": "meR7;fucE;nExT;GT1s", "site": "https://openreview.net/forum?id=fNLBmtyBiC", "pdf_size": 0, "rating": "1;6;6;6", "confidence": "4;3;4;3", "rating_avg": 4.75, "confidence_avg": 3.5, "replies_avg": 16, "authors#_avg": 2, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11878627911532414062&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "fSmkKmWM5Ry", "title": "Stochastic Occupancy Grid Map Prediction in Dynamic Scenes", "track": "main", "status": "Poster", "tldr": "We use techniques from generative AI to create a probabilistic prediction of the future to improve mobile robot navigation in dynamic spaces", "abstract": "This paper presents two variations of a novel stochastic prediction algorithm that enables mobile robots to accurately and robustly predict the future state of complex dynamic scenes. The proposed algorithm uses a variational autoencoder to predict a range of possible future states of the environment. The algorithm takes full advantage of the motion of the robot itself, the motion of dynamic objects, and the geometry of static objects in the scene to improve prediction accuracy. Three simulated and real-world datasets collected by different robot models are used to demonstrate that the proposed algorithm is able to achieve more accurate and robust prediction performance than other prediction algorithms. Furthermore, a predictive uncertainty-aware planner is proposed to demonstrate the effectiveness of the proposed predictor in simulation and real-world navigation experiments. 
Implementations are open source at https://github.com/TempleRAIL/SOGMP.", "keywords": "Environment Prediction;Probabilistic Inference;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/e7647f09f803b1738b5431b706c83e025fe1a892.zip", "author": "Zhanteng Xie;Philip Dames", "authorids": "~Zhanteng_Xie1;~Philip_Dames1", "gender": "M;M", "homepage": "https://sites.google.com/site/zhantengxie;https://sites.temple.edu/trail/", "dblp": ";125/5540", "google_scholar": "zeO96twAAAAJ;s4yerDQAAAAJ", "orcid": "0000-0002-5442-1252;0000-0002-7257-0075", "linkedin": "zhantengxie;philip-dames-8b123b191/", "or_profile": "~Zhanteng_Xie1;~Philip_Dames1", "aff": "Temple University;Temple University", "aff_domain": "temple.edu;temple.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxie2023stochastic,\ntitle={Stochastic Occupancy Grid Map Prediction in Dynamic Scenes},\nauthor={Zhanteng Xie and Philip Dames},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=fSmkKmWM5Ry}\n}", "github": "https://github.com/TempleRAIL/SOGMP", "project": "", "reviewers": "iQee;M8W2;Kpfb;QuXe", "site": "https://openreview.net/forum?id=fSmkKmWM5Ry", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;3;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 7, "authors#_avg": 2, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17347299089058508221&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Temple University", "aff_unique_dep": "", "aff_unique_url": "https://www.temple.edu", "aff_unique_abbr": "Temple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "fa7FzDjhzs9", "title": "HACMan: Learning Hybrid Actor-Critic Maps for 6D Non-Prehensile Manipulation", "track": "main", "status": "Oral", "tldr": "We propose a reinforcement learning framework with an object-centric action representation defined over point cloud observations and evaluate it with a 6D non-prehensile manipulation task.", "abstract": "Manipulating objects without grasping them is an essential component of human dexterity, referred to as non-prehensile manipulation. Non-prehensile manipulation may enable more complex interactions with the objects, but also presents challenges in reasoning about gripper-object interactions. In this work, we introduce Hybrid Actor-Critic Maps for Manipulation (HACMan), a reinforcement learning approach for 6D non-prehensile manipulation of objects using point cloud observations. HACMan proposes a temporally-abstracted and spatially-grounded object-centric action representation that consists of selecting a contact location from the object point cloud and a set of motion parameters describing how the robot will move after making contact. We modify an existing off-policy RL algorithm to learn in this hybrid discrete-continuous action representation. We evaluate HACMan on a 6D object pose alignment task in both simulation and in the real world. On the hardest version of our task, with randomized initial poses, randomized 6D goals, and diverse object categories, our policy demonstrates strong generalization to unseen object categories without a performance drop, achieving an 89% success rate on unseen objects in simulation and 50% success rate with zero-shot transfer in the real world. 
Compared to alternative action representations, HACMan achieves a success rate more than three times higher than the best baseline. With zero-shot sim2real transfer, our policy can successfully manipulate unseen objects in the real world for challenging non-planar goals, using dynamic and contact-rich non-prehensile skills. Videos can be found on the project website: https://hacman-2023.github.io.", "keywords": "Action Representation;Reinforcement Learning with 3D Vision;Non-prehensile Manipulation", "primary_area": "", "supplementary_material": "/attachment/ef08aabd574dfda914ef168bf3899467284098c9.zip", "author": "Wenxuan Zhou;Bowen Jiang;Fan Yang;Chris Paxton;David Held", "authorids": "~Wenxuan_Zhou1;~Bowen_Jiang4;~Fan_Yang13;~Chris_Paxton1;~David_Held1", "gender": "F;M;M;M;M", "homepage": "https://wenxuan-zhou.github.io/;;https://fanyangr.github.io;https://cpaxton.github.io/;http://davheld.github.io/", "dblp": ";;;;22/11147", "google_scholar": "picvdvEAAAAJ;;qWoep9AAAAAJ;I1mOQpAAAAAJ;0QtU-NsAAAAJ", "orcid": ";;;;", "linkedin": ";bwww/;;;", "or_profile": "~Wenxuan_Zhou1;~Bowen_Jiang4;~Fan_Yang13;~Chris_Paxton1;~David_Held1", "aff": "Meta AI;Carnegie Mellon University;Carnegie Mellon University;Meta Platforms;Carnegie Mellon University", "aff_domain": "meta.com;andrew.cmu.edu;andrew.cmu.edu;meta.com;cmu.edu", "position": "Intern;MS student;MS student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhou2023hacman,\ntitle={{HACM}an: Learning Hybrid Actor-Critic Maps for 6D Non-Prehensile Manipulation},\nauthor={Wenxuan Zhou and Bowen Jiang and Fan Yang and Chris Paxton and David Held},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=fa7FzDjhzs9}\n}", "github": "", "project": "", "reviewers": "3gnN;sYWT;h5wU;Qqvw", "site": "https://openreview.net/forum?id=fa7FzDjhzs9", "pdf_size": 0, "rating": "6;10;10;10", "confidence": "5;4;5;4", "rating_avg": 9.0, "confidence_avg": 4.5, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11559198988304780173&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Meta;Carnegie Mellon University", "aff_unique_dep": "Meta AI;", "aff_unique_url": "https://meta.com;https://www.cmu.edu", "aff_unique_abbr": "Meta;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "flyQ0v8cgC", "title": "Continual Vision-based Reinforcement Learning with Group Symmetries", "track": "main", "status": "Oral", "tldr": "We propose a task-agnostic vision-based continual RL algorithm that grows a policy for each task group that contains equivariant tasks, instead of a single task, and automatically detects task group delineations in an unsupervised manner.", "abstract": "Continual reinforcement learning aims to sequentially learn a variety of tasks, retaining the ability to perform previously encountered tasks while simultaneously developing new policies for novel tasks. However, current continual RL approaches overlook the fact that certain tasks are identical under basic group operations like rotations or translations, especially with visual inputs. They may unnecessarily learn and maintain a new policy for each similar task, leading to poor sample efficiency and weak generalization capability. 
To address this, we introduce a unique Continual Vision-based Reinforcement Learning method that recognizes Group Symmetries, called COVERS, cultivating a policy for each group of equivalent tasks rather than an individual task. COVERS employs a proximal-policy-gradient-based (PPO-based) algorithm to train each policy, which contains an equivariant feature extractor and takes inputs with different modalities, including image observations and robot proprioceptive states. It also utilizes an unsupervised task grouping mechanism that relies on 1-Wasserstein distance on the extracted invariant features. We evaluate COVERS on a sequence of table-top manipulation tasks in simulation and on a real robot platform. Our results show that COVERS accurately assigns tasks to their respective groups and significantly outperforms baselines by generalizing to unseen but equivariant tasks in seen task groups. Demos are available on our project page: https://sites.google.com/view/rl-covers/.", "keywords": "Continual Learning;Symmetry;Manipulation", "primary_area": "", "supplementary_material": "/attachment/30ae0e9a3522c326579315e1549344e796e99c70.zip", "author": "Shiqi Liu;Mengdi Xu;Peide Huang;Xilun Zhang;Yongkang Liu;Kentaro Oguchi;Ding Zhao", "authorids": "~Shiqi_Liu2;~Mengdi_Xu3;~Peide_Huang1;~Xilun_Zhang1;~Yongkang_Liu4;~Kentaro_Oguchi1;~Ding_Zhao1", "gender": "M;F;;M;;;", "homepage": "https://shiqiliu-67.github.io/;https://mxu34.github.io/;https://peidehuang.github.io/;https://github.com/XilunZhangRobo;;https://www.toyota.com;https://safeai-lab.github.io", "dblp": ";;295/8645;;;;", "google_scholar": "PiuAi5wAAAAJ;https://scholar.google.com/citations?hl=zh-CN;g5U-sjoAAAAJ;;https://scholar.google.com/citations?hl=en;;z7tPc9IAAAAJ", "orcid": ";0000-0001-9332-4175;;;;;", "linkedin": "shiqiliu2/;;peidehuang/;;;;", "or_profile": "~Shiqi_Liu2;~Mengdi_Xu3;~Peide_Huang1;~Xilun_Zhang1;~Yongkang_Liu4;~Kentaro_Oguchi1;~Ding_Zhao1", "aff": ";Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Toyota;Toyota Motor North America;Carnegie Mellon University", "aff_domain": ";cmu.edu;cmu.edu;cmu.edu;toyota.com;toyota.com;cmu.edu", "position": ";PhD student;PhD student;MS student;Principal Researcher;Director and Senior Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nliu2023continual,\ntitle={Continual Vision-based Reinforcement Learning with Group Symmetries},\nauthor={Shiqi Liu and Mengdi Xu and Peide Huang and Xilun Zhang and Yongkang Liu and Kentaro Oguchi and Ding Zhao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=flyQ0v8cgC}\n}", "github": "", "project": "", "reviewers": "JbJQ;iFGd;bj9h;WPhV", "site": "https://openreview.net/forum?id=flyQ0v8cgC", "pdf_size": 0, "rating": "10;10;10;10", "confidence": "4;4;4;4", "rating_avg": 10.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1149112194594470823&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;1;1;0", "aff_unique_norm": "Carnegie Mellon University;Toyota Motor Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.toyota.com", "aff_unique_abbr": "CMU;Toyota", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;Japan" }, { "id": "fvXFBCHVGn", "title": "Dynamic Multi-Team Racing: 
Competitive Driving on 1/10-th Scale Vehicles via Learning in Simulation", "track": "main", "status": "Poster", "tldr": "We learn dynamic 2-vs-2 multi-team racing via hierarchical policies through self-play reinforcement learning, and demonstrate that the competitive emergent behavior transfers to hardware.", "abstract": "Autonomous racing is a challenging task that requires vehicle handling at the dynamic limits of friction. While single-agent scenarios like Time Trials are solved competitively with classical model-based or model-free feedback control, multi-agent wheel-to-wheel racing poses several challenges including planning over unknown opponent intentions as well as negotiating interactions under dynamic constraints. We propose to address these challenges via a learning-based approach that effectively combines model-based techniques, massively parallel simulation, and self-play reinforcement learning to enable zero-shot sim-to-real transfer of highly dynamic policies. We deploy our algorithm in wheel-to-wheel multi-agent races on scale hardware to demonstrate the efficacy of our approach. Further details and videos can be found on the project website: https://sites.google.com/view/dynmutr/home.", "keywords": "Multi-Agent;Reinforcement Learning;Sim-to-Real Transfer;Autonomous Racing", "primary_area": "", "supplementary_material": "/attachment/8a8dfd7c1045f9da468ef85e0f4b1280f5e4f9ed.zip", "author": "Peter Werner;Tim Seyde;Paul Drews;Thomas Matrai Balch;Igor Gilitschenski;Wilko Schwarting;Guy Rosman;Sertac Karaman;Daniela Rus", "authorids": "~Peter_Werner1;~Tim_Seyde1;~Paul_Drews1;~Thomas_Matrai_Balch1;~Igor_Gilitschenski1;~Wilko_Schwarting1;~Guy_Rosman2;~Sertac_Karaman1;~Daniela_Rus1", "gender": ";;M;M;M;;M;M;F", "homepage": ";;https://scholar.google.com/citations?user=WTFllZEAAAAJ&hl=en;;https://www.gilitschenski.org/igor;;http://people.csail.mit.edu/rosman/index.html;https://karaman.mit.edu;https://www.csail.mit.edu/person/daniela-rus", "dblp": ";226/6408;;;129/1281;191/0268;53/3441;45/1718;r/DanielaRus", "google_scholar": "M5HFiMIAAAAJ;FJ7ILzkAAAAJ;WTFllZEAAAAJ;;Nuw1Y4oAAAAJ;;https://scholar.google.com/citations?hl=en;Vu-Zb7EAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;;", "linkedin": "wernerpe/;;;thomas-balch-53443a43/;igorgilitschenski/;;;;", "or_profile": "~Peter_Werner1;~Tim_Seyde1;~Paul_Drews1;~Thomas_Matrai_Balch1;~Igor_Gilitschenski1;~Wilko_Schwarting1;~Guy_Rosman2;~Sertac_Karaman1;~Daniela_Rus1", "aff": "Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;Massachusetts Institute of Technology;Toyota Research Institute;Toyota Research Institute;University of Toronto;;Toyota Research Institute;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "csail.mit.edu;mit.edu;tri.global;tri.global;toronto.edu;;tri.global;mit.edu;mit.edu", "position": "PhD student;Student;Researcher;Researcher;Assistant Professor;;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nwerner2023dynamic,\ntitle={Dynamic Multi-Team Racing: Competitive Driving on 1/10-th Scale Vehicles via Learning in Simulation},\nauthor={Peter Werner and Tim Seyde and Paul Drews and Thomas Matrai Balch and Igor Gilitschenski and Wilko Schwarting and Guy Rosman and Sertac Karaman and Daniela Rus},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=fvXFBCHVGn}\n}", "github": "", "project": "", "reviewers": "Edzc;sWPm;NBPG;x7zC", "site": 
"https://openreview.net/forum?id=fvXFBCHVGn", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;5;5", "rating_avg": 7.0, "confidence_avg": 4.5, "replies_avg": 13, "authors#_avg": 9, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6723485064512639292&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;1;2;1;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;Toyota Research Institute;University of Toronto", "aff_unique_dep": "Computer Science and Artificial Intelligence Laboratory;;", "aff_unique_url": "https://www.csail.mit.edu;https://www.tri.global;https://www.utoronto.ca", "aff_unique_abbr": "CSAIL;TRI;U of T", "aff_campus_unique_index": "0", "aff_campus_unique": "Cambridge;", "aff_country_unique_index": "0;0;0;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "fviZhMCr62", "title": "Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation", "track": "main", "status": "Poster", "tldr": "A composable framework for mobile robot navigation using modular blocks for natural language parsing, image understanding, grounding, and path planning in unknown environments.", "abstract": "Humans have the remarkable ability to navigate through unfamiliar environments by solely relying on our prior knowledge and descriptions of the environment. For robots to perform the same type of navigation, they need to be able to associate natural language descriptions with their associated physical environment with a limited amount of prior knowledge. Recently, Large Language Models (LLMs) have been able to reason over billions of parameters and utilize them in multi-modal chat-based natural language responses. However, LLMs lack real-world awareness and their outputs are not always predictable. In this work, we develop a low-bandwidth framework that solves this lack of real-world generalization by creating an intermediate layer between an LLM and a robot navigation framework in the form of Python code. Our intermediate shoehorns the vast prior knowledge inherent in an LLM model into a series of input and output API instructions that a mobile robot can understand. 
We evaluate our method across four different environments and command classes on a mobile robot and highlight our framework's ability to interpret contextual commands.", "keywords": "Natural language;navigation;contextual navigation", "primary_area": "", "supplementary_material": "/attachment/17cca06b8014e05c834e3c5859e5da1e35a1f2e3.zip", "author": "Harel Biggie;Ajay Narasimha Mopidevi;Dusty Woods;Chris Heckman", "authorids": "~Harel_Biggie1;~Ajay_Narasimha_Mopidevi1;destin.woods@colorado.edu;~Chris_Heckman1", "gender": "M;M;;M", "homepage": "http://harelbiggie.com;;;http://www.colorado.edu/cs/christoffer-heckman", "dblp": ";;;170/8568", "google_scholar": ";;;-YOtPcIAAAAJ", "orcid": ";;;", "linkedin": ";majaysimha/;;", "or_profile": "~Harel_Biggie1;~Ajay_Narasimha_Mopidevi1;destin.woods@colorado.edu;~Chris_Heckman1", "aff": "University of Colorado at Boulder;University of Colorado at Boulder;;University of Colorado at Boulder", "aff_domain": "colorado.edu;colorado.edu;;colorado.edu", "position": "PhD student;MS student;;Associate Professor", "bibtex": "@inproceedings{\nbiggie2023tell,\ntitle={Tell Me Where to Go: A Composable Framework for Context-Aware Embodied Robot Navigation},\nauthor={Harel Biggie and Ajay Narasimha Mopidevi and Dusty Woods and Chris Heckman},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=fviZhMCr62}\n}", "github": "https://github.com/arpg/navcon", "project": "", "reviewers": "DATK;9PgY;Gcpd;cxP7", "site": "https://openreview.net/forum?id=fviZhMCr62", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "5;4;3;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": -0.816496580927726, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9288458664218939216&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Colorado", "aff_unique_dep": "", "aff_unique_url": "https://www.colorado.edu", "aff_unique_abbr": "CU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Boulder", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gFXVysXh48K", "title": "Efficient Sim-to-real Transfer of Contact-Rich Manipulation Skills with Online Admittance Residual Learning", "track": "main", "status": "Poster", "tldr": "We propose an online admittance residual learning method to transfer the learned policy in simulation to the real world.", "abstract": "Learning contact-rich manipulation skills is essential. Such skills require the robots to interact with the environment with feasible manipulation trajectories and suitable compliance control parameters to enable safe and stable contact. However, learning these skills is challenging due to data inefficiency in the real world and the sim-to-real gap in simulation. In this paper, we introduce a hybrid offline-online framework to learn robust manipulation skills. We employ model-free reinforcement learning for the offline phase to obtain the robot motion and compliance control parameters in simulation with domain randomization. Subsequently, in the online phase, we learn the residual of the compliance control parameters to maximize robot performance-related criteria with force sensor measurements in real-time. 
To demonstrate the effectiveness and robustness of our approach, we provide comparative results against existing methods for assembly, pivoting, and screwing tasks.", "keywords": "Contact-rich Manipulation;Compliance Control", "primary_area": "", "supplementary_material": "/attachment/ed30045958aca9af1a7f7cc816cfa55391f57bcf.zip", "author": "Xiang Zhang;Changhao Wang;Lingfeng Sun;Zheng Wu;Xinghao Zhu;Masayoshi Tomizuka", "authorids": "~Xiang_Zhang20;~Changhao_Wang2;~Lingfeng_Sun1;~Zheng_Wu2;~Xinghao_Zhu1;~Masayoshi_Tomizuka1", "gender": "M;M;M;M;;M", "homepage": "https://xiang-zhang-98.github.io/;https://changhaowang.github.io;https://lingfeng.moe;https://zhengwu.us/;;https://me.berkeley.edu/people/masayoshi-tomizuka/", "dblp": ";;;;;10/4434", "google_scholar": ";;Uxb6wbkAAAAJ;Lx_cK2YAAAAJ;;", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Xiang_Zhang20;~Changhao_Wang2;~Lingfeng_Sun1;~Zheng_Wu2;~Xinghao_Zhu1;~Masayoshi_Tomizuka1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;;berkeley.edu", "position": "PhD student;PhD student;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nzhang2023efficient,\ntitle={Efficient Sim-to-real Transfer of Contact-Rich Manipulation Skills with Online Admittance Residual Learning},\nauthor={Xiang Zhang and Changhao Wang and Lingfeng Sun and Zheng Wu and Xinghao Zhu and Masayoshi Tomizuka},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=gFXVysXh48K}\n}", "github": "", "project": "", "reviewers": "zBMt;4kUn;4sjw;xCBP", "site": "https://openreview.net/forum?id=gFXVysXh48K", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;5;3;4", "rating_avg": 6.5, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3102062304127121246&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "gVBvtRqU1_", "title": "OVIR-3D: Open-Vocabulary 3D Instance Retrieval Without Training on 3D Data", "track": "main", "status": "Poster", "tldr": "This work presents OVIR-3D, a straightforward yet effective method for open-vocabulary 3D instance retrieval without 3D data for training.", "abstract": "This work presents OVIR-3D, a straightforward yet effective method for open-vocabulary 3D object instance retrieval without using any 3D data for training. Given a language query, the proposed method is able to return a ranked set of 3D object instance segments based on the feature similarity of the instance and the text query. This is achieved by a multi-view fusion of text-aligned 2D region proposals into 3D space, where the 2D region proposal network could leverage 2D datasets, which are more accessible and typically larger than 3D datasets. The proposed fusion process is efficient as it can be performed in real-time for most indoor 3D scenes and does not require additional training in 3D space. 
Experiments on public datasets and a real robot show the effectiveness of the method and its potential for applications in robot navigation and manipulation.", "keywords": "Open-Vocabulary;3D Instance Retrieval", "primary_area": "", "supplementary_material": "/attachment/a759a93ad1c47674996c304bef3cda5b67332011.zip", "author": "Shiyang Lu;Haonan Chang;Eric Pu Jing;Abdeslam Boularias;Kostas Bekris", "authorids": "~Shiyang_Lu2;~Haonan_Chang1;~Eric_Pu_Jing1;~Abdeslam_Boularias1;~Kostas_Bekris1", "gender": "M;M;M;M;M", "homepage": ";https://github.com/changhaonan;https://ericjing.com/;http://rl.cs.rutgers.edu/;https://pracsys.cs.rutgers.edu/members/kostas-bekris/", "dblp": ";;;57/2269;42/170", "google_scholar": "pDH5AcsAAAAJ;;;https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ;https://scholar.google.com.tw/citations?user=gwC7rCUAAAAJ", "orcid": ";;;;", "linkedin": "shiyang-lu-b204ab106/;;eric-jing-47308b12b/;;kostas-bekris-0b56794/", "or_profile": "~Shiyang_Lu2;~Haonan_Chang1;~Eric_Pu_Jing1;~Abdeslam_Boularias1;~Kostas_Bekris1", "aff": "Rutgers University - New Brunswick;Rutgers, New Brunswick;Rutgers University;, Rutgers University;Rutgers University", "aff_domain": "rutgers.edu;scarletmail.rutgers.edu;rutgers.edu;cs.rutgers.edu;rutgers.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nlu2023ovird,\ntitle={{OVIR}-3D: Open-Vocabulary 3D Instance Retrieval Without Training on 3D Data},\nauthor={Shiyang Lu and Haonan Chang and Eric Pu Jing and Abdeslam Boularias and Kostas Bekris},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=gVBvtRqU1_}\n}", "github": "https://github.com/shiyoung77/OVIR-3D/", "project": "", "reviewers": "Qx8o;dbTB;MRDv;dozd", "site": "https://openreview.net/forum?id=gVBvtRqU1_", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;5;4", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 61, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10247466356050745581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "0;0", "aff_campus_unique": "New Brunswick;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "g_PPHV_GkX", "title": "Hierarchical Planning for Rope Manipulation using Knot Theory and a Learned Inverse Model", "track": "main", "status": "Poster", "tldr": "Tying with Inverse model and search in topological space excluding demos", "abstract": "This work considers planning the manipulation of deformable 1-dimensional objects, such as ropes or cables, specifically to tie knots. We propose TWISTED: Tying With Inverse model and Search in Topological space Excluding Demos, a hierarchical planning approach which, at the high level, uses ideas from knot-theory to plan a sequence of rope configurations, while at the low level uses a neural-network inverse model to move between the configurations in the high-level plan. To train the neural network, we propose a self-supervised approach, where we learn from random movements of the rope. To focus the random movements on interesting configurations, such as knots, we propose a non-uniform sampling method tailored for this domain. 
In a simulation, we show that our approach can plan significantly faster and more accurately than baselines. We also show that our plans are robust to parameter changes in the physical simulation, suggesting future applications via sim2real.", "keywords": "Manipulation;Robot Learning and Planning", "primary_area": "", "supplementary_material": "/attachment/da6280d073cc7837da93fb6a3db19982f757a519.zip", "author": "Matan Sudry;Tom Jurgenson;Aviv Tamar;Erez Karpas", "authorids": "~Matan_Sudry1;~Tom_Jurgenson1;~Aviv_Tamar2;~Erez_Karpas1", "gender": ";;M;M", "homepage": ";;https://avivt.github.io/avivt/;https://karpase.net.technion.ac.il/", "dblp": ";https://dblp.uni-trier.de/pers/hd/j/Jurgenson:Tom;49/10622;https://dblp.org/pers/hd/k/Karpas:Erez", "google_scholar": ";1YjIvioAAAAJ;https://scholar.google.co.il/citations?user=kppa2vgAAAAJ;Dya6lhgAAAAJ", "orcid": ";;;0000-0002-9328-3657", "linkedin": "matan-sudry-187a41b0/;;;", "or_profile": "~Matan_Sudry1;~Tom_Jurgenson1;~Aviv_Tamar2;~Erez_Karpas1", "aff": "Technion, Technion;Technion;Technion, Technion;Technion - Israel Institute of Technology, Technion", "aff_domain": "technion.ac.il;technion.ac.il;technion.ac.il;technion.ac.il", "position": "PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nsudry2023hierarchical,\ntitle={Hierarchical Planning for Rope Manipulation using Knot Theory and a Learned Inverse Model},\nauthor={Matan Sudry and Tom Jurgenson and Aviv Tamar and Erez Karpas},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=g_PPHV_GkX}\n}", "github": "", "project": "", "reviewers": "o22m;6Lwg;fZRK;AJzJ", "site": "https://openreview.net/forum?id=g_PPHV_GkX", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;3;3", "rating_avg": 6.5, "confidence_avg": 3.5, "replies_avg": 11, "authors#_avg": 4, "corr_rating_confidence": -0.6882472016116854, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4321796549003505782&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Israel" }, { "id": "gdkKi_F55h", "title": "SA6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded Objects", "track": "main", "status": "Poster", "tldr": "", "abstract": "To enable meaningful robotic manipulation of objects in the real-world, 6D pose estimation is one of the critical aspects. Most existing approaches have difficulties to extend predictions to scenarios where novel object instances are continuously introduced, especially with heavy occlusions. In this work, we propose a few-shot pose estimation (FSPE) approach called SA6D, which uses a self-adaptive segmentation module to identify the novel target object and construct a point cloud model of the target object using only a small number of cluttered reference images. Unlike existing methods, SA6D does not require object-centric reference images or any additional object information, making it a more generalizable and scalable solution across categories. 
We evaluate SA6D on real-world tabletop object datasets and demonstrate that SA6D outperforms existing FSPE methods, particularly in cluttered scenes with occlusions, while requiring fewer reference images.", "keywords": "", "primary_area": "", "supplementary_material": "/attachment/a0e6be68580db58bab64dea1625ac5b09449d170.zip", "author": "Ning Gao;Vien Anh Ngo;Hanna Ziesche;Gerhard Neumann", "authorids": "~Ning_Gao2;~Vien_Anh_Ngo1;~Hanna_Ziesche1;~Gerhard_Neumann2", "gender": "M;M;M;F", "homepage": "https://gaobaoding.github.io/;https://vienngo.github.io;https://alr.anthropomatik.kit.edu/;", "dblp": ";87/439;60/4878;284/0793", "google_scholar": "https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=xk1gsM8AAAAJ;https://scholar.google.com.tw/citations?user=GL360kMAAAAJ;", "orcid": "0000-0001-7788-5677;;;0000-0003-2042-3660", "linkedin": "ning-gao-9245a0165/;;;", "or_profile": "~Ning_Gao2;~Vien_Anh_Ngo1;~Gerhard_Neumann1;~Hanna_Carolin_Maria_Ziesche1", "aff": "Robert Bosch GmbH, Bosch;Bosch Center for Artificial Intelligence;Karlsruhe Institute of Technology;Robert Bosch GmbH, Bosch", "aff_domain": "de.bosch.com;bosch.com;kit.edu;de.bosch.com", "position": "Researcher;Research Scientist;Full Professor;Research Scientist", "bibtex": "@inproceedings{\ngao2023sad,\ntitle={{SA}6D: Self-Adaptive Few-Shot 6D Pose Estimator for Novel and Occluded Objects},\nauthor={Ning Gao and Vien Anh Ngo and Hanna Ziesche and Gerhard Neumann},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=gdkKi_F55h}\n}", "github": "", "project": "", "reviewers": "pbhk;Zris;8PhQ;L1f1", "site": "https://openreview.net/forum?id=gdkKi_F55h", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;3;3;4", "rating_avg": 7.0, "confidence_avg": 3.5, "replies_avg": 22, "authors#_avg": 4, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4700410604600335286&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Robert Bosch GmbH;Bosch Center for Artificial Intelligence;Karlsruhe Institute of Technology", "aff_unique_dep": ";Center for Artificial Intelligence;", "aff_unique_url": "https://www.bosch.com;https://www.bosch-ai.com;https://www.kit.edu", "aff_unique_abbr": "Bosch;BCAI;KIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "h-geaPzuJu", "title": "DROID: Learning from Offline Heterogeneous Demonstrations via Reward-Policy Distillation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Offline Learning from Demonstrations (OLfD) is valuable in domains where trial-and-error learning is infeasible or specifying a cost function is difficult, such as robotic surgery, autonomous driving, and path-finding for NASA's Mars rovers. However, two key problems remain challenging in OLfD: 1) heterogeneity: demonstration data can be generated with diverse preferences and strategies, and 2) generalizability: the learned policy and reward must perform well beyond a limited training regime in unseen test settings. To overcome these challenges, we propose Dual Reward and policy Offline Inverse Distillation (DROID), where the key idea is to leverage diversity to improve generalization performance by decomposing common-task and individual-specific strategies and distilling knowledge in both the reward and policy spaces. 
We ground DROID in a novel and uniquely challenging Mars rover path-planning problem for NASA's Mars Curiosity Rover. We also curate a novel dataset along 163 Sols (Martian days) and conduct a novel, empirical investigation to characterize heterogeneity in the dataset. We find DROID outperforms prior SOTA OLfD techniques, leading to a $26\\%$ improvement in modeling expert behaviors and $92\\%$ closer to the task objective of reaching the final destination. We also benchmark DROID on the OpenAI Gym Cartpole environment and find DROID achieves $55\\%$ (significantly) better performance modeling heterogeneous demonstrations.", "keywords": "Learning from Heterogeneous Demonstration;Network Distillation;Offline Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/14a6cb684edae6184aa62418341573bd1bbbfb89.zip", "author": "Sravan Jayanthi;Letian Chen;Nadya Balabanska;Van Duong;Erik Scarlatescu;Ezra Ameperosa;Zulfiqar Haider Zaidi;Daniel Martin;Taylor Keith Del Matto;Masahiro Ono;Matthew Gombolay", "authorids": "~Sravan_Jayanthi1;~Letian_Chen1;~Nadya_Balabanska1;van.a.duong@jpl.nasa.gov;~Erik_Scarlatescu1;~Ezra_Ameperosa1;~Zulfiqar_Haider_Zaidi1;~Daniel_Martin4;~Taylor_Keith_Del_Matto1;~Masahiro_Ono1;~Matthew_Gombolay1", "gender": ";M;;;M;;M;M;;M;M", "homepage": "https://github.com/SVJayanthi;http://letianchen.me/;https://www-robotics.jpl.nasa.gov/who-we-are/people/nadya-balabanska/;;https://erikscarlatescu.github.io;https://www.linkedin.com/in/ezra-ameperosa-6aa572a4;;https://www.linkedin.com/in/danielmartin576/;;;https://core-robotics.gatech.edu/", "dblp": ";232/1880;;;;;;;;70/753;144/1022", "google_scholar": ";SAeHYeQAAAAJ;;;;;;;;;Ihyz20wAAAAJ", "orcid": ";0000-0001-9238-7342;;;;;0000-0002-6053-6259;;;;", "linkedin": ";letianchen/;;;;;;;taylor-del-matto-2b0535aa/;;", "or_profile": "~Sravan_Jayanthi1;~Letian_Chen1;~Nadya_Balabanska1;van.a.duong@jpl.nasa.gov;~Erik_Scarlatescu1;~Ezra_Ameperosa1;~Zulfiqar_Haider_Zaidi1;~Daniel_Martin4;~Taylor_Keith_Del_Matto1;~Masahiro_Ono1;~Matthew_Gombolay1", "aff": "Georgia Institute of Technology;Toyota Research Institute;Jet Propulsion Laboratory;;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Jet Propulsion Laboratory;Georgia Institute of Technology", "aff_domain": "gatech.edu;tri.global;jpl.nasa.gov;;gatech.edu;gatech.edu;gatech.edu;gatech.edu;gatech.edu;jpl.nasa.gov;cc.gatech.edu", "position": "MS student;Research Intern;Researcher;;Undergrad student;PhD student;PhD student;Undergrad student;MS student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\njayanthi2023droid,\ntitle={{DROID}: Learning from Offline Heterogeneous Demonstrations via Reward-Policy Distillation},\nauthor={Sravan Jayanthi and Letian Chen and Nadya Balabanska and Van Duong and Erik Scarlatescu and Ezra Ameperosa and Zulfiqar Haider Zaidi and Daniel Martin and Taylor Keith Del Matto and Masahiro Ono and Matthew Gombolay},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=h-geaPzuJu}\n}", "github": "", "project": "", "reviewers": "FG7m;CWnj;CKuJ;tYgj", "site": "https://openreview.net/forum?id=h-geaPzuJu", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;3;3", "rating_avg": 5.5, "confidence_avg": 3.25, "replies_avg": 18, "authors#_avg": 11, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 5, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=16185811701088696863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;0;0;0;0;2;0", "aff_unique_norm": "Georgia Institute of Technology;Toyota Research Institute;Jet Propulsion Laboratory", "aff_unique_dep": ";;", "aff_unique_url": "https://www.gatech.edu;https://www.tri.global;https://www.jpl.nasa.gov", "aff_unique_abbr": "Georgia Tech;TRI;JPL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "h8halpbqB-", "title": "Im2Contact: Vision-Based Contact Localization Without Touch or Force Sensing", "track": "main", "status": "Poster", "tldr": "", "abstract": "Contacts play a critical role in most manipulation tasks. Robots today mainly use proximal touch/force sensors to sense contacts, but the information they provide must be calibrated and is inherently local, with practical applications relying either on extensive surface coverage or restrictive assumptions to resolve ambiguities. We propose a vision-based extrinsic contact localization task: with only a single RGB-D camera view of a robot workspace, identify when and where an object held by the robot contacts the rest of the environment. We show that careful task-attuned design is critical for a neural network trained in simulation to discover solutions that transfer well to a real robot. Our final approach im2contact demonstrates the promise of versatile general-purpose contact perception from vision alone, performing well for localizing various contact types (point, line, or planar; sticking, sliding, or rolling; single or multiple), and even under occlusions in its camera view. Video results can be found at: https://sites.google.com/view/im2contact/home", "keywords": "contact perception;manipulation;vision-based", "primary_area": "", "supplementary_material": "/attachment/3636fa6219ed39256b7aae23e1a1b143b9de61e1.zip", "author": "Leon Kim;Yunshuang Li;Michael Posa;Dinesh Jayaraman", "authorids": "~Leon_Kim1;~Yunshuang_Li1;~Michael_Posa1;~Dinesh_Jayaraman2", "gender": ";F;M;M", "homepage": "https://www.grasp.upenn.edu/people/leon-kim/;https://li-yunshuang.github.io/;https://dair.seas.upenn.edu/;https://www.seas.upenn.edu/~dineshj/", "dblp": "257/4067;;129/2382;145/3870", "google_scholar": ";;DCSFMuAAAAAJ;QxLpghAAAAAJ", "orcid": ";;;0000-0002-6888-3095", "linkedin": ";;;dinesh-jayaraman-44b31539/", "or_profile": "~Leon_Kim1;~Yunshuang_Li1;~Michael_Posa1;~Dinesh_Jayaraman2", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "upenn.edu;upenn.edu;upenn.edu;upenn.edu", "position": "PhD student;MS student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nkim2023imcontact,\ntitle={Im2Contact: Vision-Based Contact Localization Without Touch or Force Sensing},\nauthor={Leon Kim and Yunshuang Li and Michael Posa and Dinesh Jayaraman},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=h8halpbqB-}\n}", "github": "", "project": "", "reviewers": "rgiT;25ZM;qKK8;PPMJ", "site": "https://openreview.net/forum?id=h8halpbqB-", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=13064964303451003129&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "hRZ1YjDZmTo", "title": "MimicPlay: Long-Horizon Imitation Learning by Watching Human Play", "track": "main", "status": "Oral", "tldr": "We present MimicPlay, a novel imitation learning algorithm that leverages cost-effective human play data to learn long-horizon manipulation tasks in a sample-efficient manner.", "abstract": "Imitation learning from human demonstrations is a promising paradigm for teaching robots manipulation skills in the real world. However, learning complex long-horizon tasks often requires an unattainable amount of demonstrations. To reduce the high data requirement, we resort to human play data - video sequences of people freely interacting with the environment using their hands. Even with different morphologies, we hypothesize that human play data contain rich and salient information about physical interactions that can readily facilitate robot policy learning. Motivated by this, we introduce a hierarchical learning framework named MimicPlay that learns latent plans from human play data to guide low-level visuomotor control trained on a small number of teleoperated demonstrations. With systematic evaluations of 14 long-horizon manipulation tasks in the real world, we show that MimicPlay outperforms state-of-the-art imitation learning methods in task success rate, generalization ability, and robustness to disturbances. 
Code and videos are available at https://mimic-play.github.io.", "keywords": "Imitation Learning;Learning from Human;Long-Horizon Manipulation", "primary_area": "", "supplementary_material": "/attachment/c717e06b0bd3fa8e179b66d7a42d9a4ddcabeda6.zip", "author": "Chen Wang;Linxi Fan;Jiankai Sun;Ruohan Zhang;Li Fei-Fei;Danfei Xu;Yuke Zhu;Anima Anandkumar", "authorids": "~Chen_Wang16;~Linxi_Fan2;~Jiankai_Sun6;~Ruohan_Zhang1;~Li_Fei-Fei1;~Danfei_Xu1;~Yuke_Zhu1;~Anima_Anandkumar1", "gender": "M;;;M;F;M;M;F", "homepage": "http://www.chenwangjeremy.net/;;;https://ai.stanford.edu/~zharu/;https://profiles.stanford.edu/fei-fei-li;https://cs.stanford.edu/~danfei/;https://cs.utexas.edu/~yukez/;http://tensorlab.cms.caltech.edu/users/anima/", "dblp": ";154/6778;121/4211;;79/2528;135/8443;133/1772;", "google_scholar": "lStkAzsAAAAJ;sljtWIUAAAAJ;726MCb8AAAAJ;-bqvNWoAAAAJ;rDfyQnIAAAAJ;J5D4kcoAAAAJ;mWGyYMsAAAAJ;bEcLezcAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;fei-fei-li-4541247/;;;anima-anandkumar-35171b1/", "or_profile": "~Chen_Wang16;~Linxi_Fan2;~Jiankai_Sun6;~Ruohan_Zhang1;~Li_Fei-Fei1;~Danfei_Xu1;~Yuke_Zhu1;~anima_anandkumar1", "aff": "Computer Science Department, Stanford University;NVIDIA;Stanford University;Stanford University;Stanford University;NVIDIA;Computer Science Department, University of Texas, Austin;California Institute of Technology", "aff_domain": "cs.stanford.edu;nvidia.com;stanford.edu;stanford.edu;stanford.edu;nvidia.com;cs.utexas.edu;caltech.edu", "position": "PhD student;Researcher;PhD student;Postdoc;Full Professor;Research Scientist;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2023mimicplay,\ntitle={MimicPlay: Long-Horizon Imitation Learning by Watching Human Play},\nauthor={Chen Wang and Linxi Fan and Jiankai Sun and Ruohan Zhang and Li Fei-Fei and Danfei Xu and Yuke Zhu and Anima Anandkumar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=hRZ1YjDZmTo}\n}", "github": "https://github.com/j96w/MimicPlay", "project": "", "reviewers": "17ab;5FXC;4uup;mxMa", "site": "https://openreview.net/forum?id=hRZ1YjDZmTo", "pdf_size": 0, "rating": "6;10;10;10", "confidence": "4;5;3;4", "rating_avg": 9.0, "confidence_avg": 4.0, "replies_avg": 11, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 187, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1485976264239343123&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;0;1;2;3", "aff_unique_norm": "Stanford University;NVIDIA;University of Texas at Austin;California Institute of Technology", "aff_unique_dep": "Computer Science Department;NVIDIA Corporation;Computer Science Department;", "aff_unique_url": "https://www.stanford.edu;https://www.nvidia.com;https://www.utexas.edu;https://www.caltech.edu", "aff_unique_abbr": "Stanford;NVIDIA;UT Austin;Caltech", "aff_campus_unique_index": "0;0;0;0;2;3", "aff_campus_unique": "Stanford;;Austin;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "i84V7i6KEMd", "title": "Sample-Efficient Preference-based Reinforcement Learning with Dynamics Aware Rewards", "track": "main", "status": "Poster", "tldr": "We provide state-action transition dynamics into a reward function learnt from trajectory preferences and find that we can obtain baseline performance with one order of magnitude fewer preferences.", "abstract": "Preference-based reinforcement learning (PbRL) aligns a robot behavior with human preferences via a 
reward function learned from binary feedback over agent behaviors. We show that encoding environment dynamics in the reward function improves the sample efficiency of PbRL by an order of magnitude. In our experiments we iterate between: (1) encoding environment dynamics in a state-action representation $z^{sa}$ via a self-supervised temporal consistency task, and (2) bootstrapping the preference-based reward function from $z^{sa}$, which results in faster policy learning and better final policy performance. For example, on quadruped-walk, walker-walk, and cheetah-run, with 50 preference labels we achieve the same performance as existing approaches with 500 preference labels, and we recover 83% and 66% of ground truth reward policy performance versus only 38% and 21% without environment dynamics. The performance gains demonstrate that _explicitly encoding environment dynamics improves preference-learned reward functions_.", "keywords": "human-in-the-loop learning;preference-based RL;RLHF", "primary_area": "", "supplementary_material": "", "author": "Katherine Metcalf;Miguel Sarabia;Natalie Mackraz;Barry-John Theobald", "authorids": "~Katherine_Metcalf1;~Miguel_Sarabia1;~Natalie_Mackraz1;~Barry-John_Theobald1", "gender": ";;F;M", "homepage": ";;;", "dblp": ";;359/5984;86/6624", "google_scholar": ";;;DNrQd3IAAAAJ", "orcid": ";;;", "linkedin": ";;natalie-mackraz/;barry-john-theobald-392a0611a/", "or_profile": "~Katherine_Metcalf1;~Miguel_Sarabia1;~Natalie_Mackraz1;~Barry-John_Theobald1", "aff": ";;Apple;Apple", "aff_domain": ";;apple.com;apple.com", "position": ";;Researcher;Researcher", "bibtex": "@inproceedings{\nmetcalf2023sampleefficient,\ntitle={Sample-Efficient Preference-based Reinforcement Learning with Dynamics Aware Rewards},\nauthor={Katherine Metcalf and Miguel Sarabia and Natalie Mackraz and Barry-John Theobald},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=i84V7i6KEMd}\n}", "github": "", "project": "", "reviewers": "D6aA;zDek;XueR;8gBh", "site": "https://openreview.net/forum?id=i84V7i6KEMd", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;4;4;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 9, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13873133854096374770&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Apple", "aff_unique_dep": "Apple Inc.", "aff_unique_url": "https://www.apple.com", "aff_unique_abbr": "Apple", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "ihqTtzS83VS", "title": "Learning Reusable Manipulation Strategies", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humans demonstrate an impressive ability to acquire and generalize manipulation \"tricks.\" Even from a single demonstration, such as using soup ladles to reach for distant objects, we can apply this skill to new scenarios involving different object positions, sizes, and categories (e.g., forks and hammers). Additionally, we can flexibly combine various skills to devise long-term plans. In this paper, we present a framework that enables machines to acquire such manipulation skills, referred to as \"mechanisms,'' through a single demonstration and self-play. 
Our key insight lies in interpreting each demonstration as a sequence of changes in robot-object and object-object contact modes, which provides a scaffold for learning detailed samplers for continuous parameters. These learned mechanisms and samplers can be seamlessly integrated into standard task and motion planners, enabling their compositional use.", "keywords": "Contact Modeling and Manipulation;Task and Motion Planning", "primary_area": "", "supplementary_material": "/attachment/bb40b6eeaabae785a4ff90a45b741065bd8163eb.zip", "author": "Jiayuan Mao;Tom\u00e1s Lozano-P\u00e9rez;Joshua B. Tenenbaum;Leslie Pack Kaelbling", "authorids": "~Jiayuan_Mao1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1", "gender": "F;M;;F", "homepage": "http://jiayuanm.com;http://people.csail.mit.edu/tlp/;;http://people.csail.mit.edu/lpk/", "dblp": "200/8283;90/752;t/JoshuaBTenenbaum;k/LesliePackKaelbling", "google_scholar": "-xaOIZIAAAAJ;gQOKAggAAAAJ;;IcasIiwAAAAJ", "orcid": "0000-0003-4798-3748;;;0000-0001-6054-7145", "linkedin": ";;;", "or_profile": "~Jiayuan_Mao1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Joshua_B._Tenenbaum1;~Leslie_Pack_Kaelbling1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;Full Professor;Professor;Full Professor", "bibtex": "@inproceedings{\nmao2023learning,\ntitle={Learning Reusable Manipulation Strategies},\nauthor={Jiayuan Mao and Tom{\\'a}s Lozano-P{\\'e}rez and Joshua B. Tenenbaum and Leslie Pack Kaelbling},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ihqTtzS83VS}\n}", "github": "", "project": "", "reviewers": "W6qd;5d1G;KXeh;sZoG", "site": "https://openreview.net/forum?id=ihqTtzS83VS", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "2;3;3;3", "rating_avg": 6.5, "confidence_avg": 2.75, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": 0.6622661785325219, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4335192760389203843&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "j2AQ-WJ_ze", "title": "Language-guided Robot Grasping: CLIP-based Referring Grasp Synthesis in Clutter", "track": "main", "status": "Poster", "tldr": "A new benchmark and an end-to-end model for language-guided 4-DoF grasp synthesis in cluttered tabletop scenes.", "abstract": "Robots operating in human-centric environments require the integration of visual grounding and grasping capabilities to effectively manipulate objects based on user instructions. This work focuses on the task of referring grasp synthesis, which predicts a grasp pose for an object referred through natural language in cluttered scenes. Existing approaches often employ multi-stage pipelines that first segment the referred object and then propose a suitable grasp, and are evaluated in private datasets or simulators that do not capture the complexity of natural indoor scenes. 
To address these limitations, we develop a challenging benchmark based on cluttered indoor scenes from OCID dataset, for which we generate referring expressions and connect them with 4-DoF grasp poses. Further, we propose a novel end-to-end model (CROG) that leverages the visual grounding capabilities of CLIP to learn grasp synthesis directly from image-text pairs. Our results show that vanilla integration of CLIP with pretrained models transfers poorly in our challenging benchmark, while CROG achieves significant improvements both in terms of grounding and grasping. Extensive robot experiments in both simulation and hardware demonstrate the effectiveness of our approach in challenging interactive object grasping scenarios that include clutter.", "keywords": "Language-Guided Robot Grasping;Referring Grasp Synthesis;Visual Grounding", "primary_area": "", "supplementary_material": "/attachment/b08addecca69c2a2ab6cd8c4f62b9c4b24205532.zip", "author": "Georgios Tziafas;Yucheng XU;Arushi Goel;Mohammadreza Kasaei;Zhibin Li;Hamidreza Kasaei", "authorids": "~Georgios_Tziafas1;~Yucheng_XU1;~Arushi_Goel2;~Mohammadreza_Kasaei1;~Zhibin_Li2;~Hamidreza_Kasaei1", "gender": "M;M;F;M;M;M", "homepage": ";;https://goelarushi.github.io/;https://mohammadkasaei.github.io/Mohammadreza-Kasaei/;https://www.research.ed.ac.uk/en/persons/zhibin-alex-li/;https://www.ai.rug.nl/hkasaei", "dblp": ";;;;;", "google_scholar": ";;tj08PZcAAAAJ;2aY06V4AAAAJ;;VFr_XuYAAAAJ", "orcid": ";0000-0001-9023-0974;;;;", "linkedin": ";;;;;hamidreza-kasaei-49b83b57/", "or_profile": "~Georgios_Tziafas1;~Yucheng_XU1;~Arushi_Goel2;~Mohammadreza_Kasaei1;~Zhibin_Li2;~Hamidreza_Kasaei1", "aff": "University of Groningen;University of Edinburgh, University of Edinburgh;University of Edinburgh;University of Edinburgh, University of Edinburgh;University College London, University of London;University of Groningen", "aff_domain": "rug.nl;ed.ac.uk;ed.ac.uk;ed.ac.uk;ucl.ac.uk;rug.nl", "position": "PhD student;PhD student;PhD student;Postdoc;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ntziafas2023languageguided,\ntitle={Language-guided Robot Grasping: {CLIP}-based Referring Grasp Synthesis in Clutter},\nauthor={Georgios Tziafas and Yucheng XU and Arushi Goel and Mohammadreza Kasaei and Zhibin Li and Hamidreza Kasaei},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=j2AQ-WJ_ze}\n}", "github": "https://github.com/gtziafas/OCID-VLG", "project": "", "reviewers": "xdNw;upRx;ZgiR", "site": "https://openreview.net/forum?id=j2AQ-WJ_ze", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;4;5", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 1.0, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5463553575200466058&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 10, "aff_unique_index": "0;1;1;1;2;0", "aff_unique_norm": "University of Groningen;University of Edinburgh;University College London", "aff_unique_dep": ";;", "aff_unique_url": "https://www.rug.nl;https://www.ed.ac.uk;https://www.ucl.ac.uk", "aff_unique_abbr": "RUG;Edinburgh;UCL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;0", "aff_country_unique": "Netherlands;United Kingdom" }, { "id": "k-Fg8JDQmc", "title": "Language Embedded Radiance Fields for Zero-Shot Task-Oriented Grasping", "track": "main", "status": "Oral", "tldr": "Task-oriented grasping with 
vision-language models in 3D using language embedded radiance fields", "abstract": "Grasping objects by a specific subpart is often crucial for safety and for executing downstream tasks. We propose LERF-TOGO, Language Embedded Radiance Fields for Task-Oriented Grasping of Objects, which uses vision-language models zero-shot to output a grasp distribution over an object given a natural language query. To accomplish this, we first construct a LERF of the scene, which distills CLIP embeddings into a multi-scale 3D language field queryable with text. However, LERF has no sense of object boundaries, so its relevancy outputs often return incomplete activations over an object which are insufficient for grasping. LERF-TOGO mitigates this lack of spatial grouping by extracting a 3D object mask via DINO features and then conditionally querying LERF on this mask to obtain a semantic distribution over the object to rank grasps from an off-the-shelf grasp planner. We evaluate LERF-TOGO\u2019s ability to grasp task-oriented object parts on 31 physical objects, and find it selects grasps on the correct part in 81% of trials and grasps successfully in 69%. Code, data, appendix, and details are available at: lerftogo.github.io", "keywords": "NeRF;Natural Language;Grasping;Semantics", "primary_area": "", "supplementary_material": "/attachment/b88e2e54a90fa7e4e412b92966eaf8b6f4f97084.zip", "author": "Adam Rashid;Satvik Sharma;Chung Min Kim;Justin Kerr;Lawrence Yunliang Chen;Angjoo Kanazawa;Ken Goldberg", "authorids": "~Adam_Rashid1;~Satvik_Sharma1;~Chung_Min_Kim1;~Justin_Kerr1;~Lawrence_Yunliang_Chen1;~Angjoo_Kanazawa1;~Ken_Goldberg1", "gender": "M;M;;M;M;F;M", "homepage": ";;https://chungmin99.github.io/;https://kerrj.github.io/;https://yunliangchen.github.io/;https://people.eecs.berkeley.edu/~kanazawa/;http://goldberg.berkeley.edu/", "dblp": ";;305/3515;;;119/1305;g/KennethYGoldberg", "google_scholar": ";0wZN6hEAAAAJ;ODr5lMgAAAAJ;;;Ci-_QYIAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ", "orcid": ";;;;;;0000-0001-6747-9499", "linkedin": "adam-rashid-83a94a1b6/;;;;lawrence-yunliang-chen/;;goldbergken/", "or_profile": "~Adam_Rashid1;~Satvik_Sharma1;~Chung_Min_Kim1;~Justin_Kerr1;~Lawrence_Yunliang_Chen1;~Angjoo_Kanazawa1;~Ken_Goldberg1", "aff": ";University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": ";berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": ";Undergrad student;PhD student;PhD student;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nrashid2023language,\ntitle={Language Embedded Radiance Fields for Zero-Shot Task-Oriented Grasping},\nauthor={Adam Rashid and Satvik Sharma and Chung Min Kim and Justin Kerr and Lawrence Yunliang Chen and Angjoo Kanazawa and Ken Goldberg},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=k-Fg8JDQmc}\n}", "github": "", "project": "", "reviewers": "B1ts;aACe;5Sor;bAJo", "site": "https://openreview.net/forum?id=k-Fg8JDQmc", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "4;4;4;3", "rating_avg": 7.5, "confidence_avg": 3.75, "replies_avg": 8, "authors#_avg": 7, "corr_rating_confidence": -0.5555555555555555, "gs_citation": 88, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10160617997986232830&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, 
"aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of California, Berkeley", "aff_unique_dep": "", "aff_unique_url": "https://www.berkeley.edu", "aff_unique_abbr": "UC Berkeley", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kOm3jWX8YN", "title": "Learning to Discern: Imitating Heterogeneous Human Demonstrations with Preference and Representation Learning", "track": "main", "status": "Poster", "tldr": "L2D, a new IL framework, enhances policy performance by learning from varied demonstrations, utilizing latent trajectory representations to discern and prioritize high-quality training data in both simulated and robot tasks", "abstract": "Practical Imitation Learning (IL) systems rely on large human demonstration datasets for successful policy learning. However, challenges lie in maintaining the quality of collected data and addressing the suboptimal nature of some demonstrations, which can compromise the overall dataset quality and hence the learning outcome. Furthermore, the intrinsic heterogeneity in human behavior can produce equally successful but disparate demonstrations, further exacerbating the challenge of discerning demonstration quality. To address these challenges, this paper introduces Learning to Discern (L2D), an offline imitation learning framework for learning from demonstrations with diverse quality and style. Given a small batch of demonstrations with sparse quality labels, we learn a latent representation for temporally embedded trajectory segments. Preference learning in this latent space trains a quality evaluator that generalizes to new demonstrators exhibiting different styles. Empirically, we show that L2D can effectively assess and learn from varying demonstrations, thereby leading to improved policy performance across a range of tasks in both simulations and on a physical robot.", "keywords": "Imitation Learning;Preference Learning;Manipulation", "primary_area": "", "supplementary_material": "", "author": "Sachit Kuhar;Shuo Cheng;Shivang Chopra;Matthew Bronars;Danfei Xu", "authorids": "~Sachit_Kuhar1;~Shuo_Cheng1;~Shivang_Chopra1;~Matthew_Bronars1;~Danfei_Xu1", "gender": "M;M;M;M;M", "homepage": "https://sachitkuhar.github.io/;https://sites.google.com/view/shuocheng/home;https://shivangchopra11.github.io/;https://bronars.github.io/;https://cs.stanford.edu/~danfei/", "dblp": "240/0924;179/0863;262/6038;;135/8443", "google_scholar": "X8slYZEAAAAJ;5CL_0qMAAAAJ;8zokdYsAAAAJ;;J5D4kcoAAAAJ", "orcid": "0000-0002-5739-013X;;0000-0002-3567-852X;;", "linkedin": ";;https://linkedin.com/in/shivangchopra11;;", "or_profile": "~Sachit_Kuhar1;~Shuo_Cheng1;~Shivang_Chopra1;~Matthew_Bronars1;~Danfei_Xu1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;NVIDIA", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu;nvidia.com", "position": "MS student;PhD student;MS student;MS student;Research Scientist", "bibtex": "@inproceedings{\nkuhar2023learning,\ntitle={Learning to Discern: Imitating Heterogeneous Human Demonstrations with Preference and Representation Learning},\nauthor={Sachit Kuhar and Shuo Cheng and Shivang Chopra and Matthew Bronars and Danfei Xu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=kOm3jWX8YN}\n}", "github": "", "project": "", "reviewers": "L98X;xRQR;pGpd;kfod", "site": 
"https://openreview.net/forum?id=kOm3jWX8YN", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;4;3", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 10, "authors#_avg": 5, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=826165896476530176&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kSXh83gWWy", "title": "Context-Aware Deep Reinforcement Learning for Autonomous Robotic Navigation in Unknown Area", "track": "main", "status": "Poster", "tldr": "We propose a learning-based DRL framework to achieve fast and efficient mapless navigation.", "abstract": "Mapless navigation refers to a challenging task where a mobile robot must rapidly navigate to a predefined destination using its partial knowledge of the environment, which is updated online along the way, instead of a prior map of the environment. Inspired by the recent developments in deep reinforcement learning (DRL), we propose a learning-based framework for mapless navigation, which employs a context-aware policy network to achieve efficient decision-making (i.e., maximize the likelihood of finding the shortest route towards the target destination), especially in complex and large-scale environments. Specifically, our robot learns to form a context of its belief over the entire known area, which it uses to reason about long-term efficiency and sequence show-term movements. Additionally, we propose a graph rarefaction algorithm to enable more efficient decision-making in large-scale applications. We empirically demonstrate that our approach reduces average travel time by up to $61.4\\%$ and average planning time by up to $88.2\\%$ compared to benchmark planners (D*lite and BIT) on hundreds of test scenarios. 
We also validate our approach both in high-fidelity Gazebo simulations as well as on hardware, highlighting its promising applicability in the real world without further training/tuning.", "keywords": "deep reinforcement learning;mapless navigation;context-aware decision-making", "primary_area": "", "supplementary_material": "/attachment/3f5db36f5681b755bd8dd5bd6db61f53d59305a9.zip", "author": "Jingsong Liang;Zhichen Wang;Yuhong Cao;Jimmy Chiun;Mengqi Zhang;Guillaume Adrien Sartoretti", "authorids": "~Jingsong_Liang1;~Zhichen_Wang1;~Yuhong_Cao1;~Jimmy_Chiun1;~Mengqi_Zhang5;~Guillaume_Adrien_Sartoretti1", "gender": "M;M;M;M;M;M", "homepage": "https://jingsongliang.com/;https://github.com/bobyjaywzc;;;https://cde.nus.edu.sg/me/staff/zhang-mengqi;https://marmotlab.org/", "dblp": "368/8380;;;;;118/9066", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;n7NzZ0sAAAAJ", "orcid": ";;0000-0001-8099-0689;0009-0009-5184-8291;;0000-0002-7579-9916", "linkedin": "jingsongliang/;;;jimmychiun;;", "or_profile": "~Jingsong_Liang1;~Zhichen_Wang1;~Yuhong_Cao1;~Jimmy_Chiun1;~Mengqi_Zhang5;~Guillaume_Adrien_Sartoretti1", "aff": "National University of Singapore, NUS;National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore;National University of Singapore", "aff_domain": "u.nus.edu;nus.edu;u.nus.edu;u.nus.edu;nus.edu;nus.edu.sg", "position": "MS student;MS student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliang2023contextaware,\ntitle={Context-Aware Deep Reinforcement Learning for Autonomous Robotic Navigation in Unknown Area},\nauthor={Jingsong Liang and Zhichen Wang and Yuhong Cao and Jimmy Chiun and Mengqi Zhang and Guillaume Adrien Sartoretti},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=kSXh83gWWy}\n}", "github": "https://github.com/marmotlab/Context_Aware_Navigation", "project": "", "reviewers": "CUZW;cs7G;eMFY;qPjV", "site": "https://openreview.net/forum?id=kSXh83gWWy", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;3;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 11, "authors#_avg": 6, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4114336466098130435&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "National University of Singapore", "aff_unique_dep": "", "aff_unique_url": "https://www.nus.edu.sg", "aff_unique_abbr": "NUS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Singapore" }, { "id": "keAPCON4jHC", "title": "Robust Reinforcement Learning in Continuous Control Tasks with Uncertainty Set Regularization", "track": "main", "status": "Poster", "tldr": "", "abstract": "Reinforcement learning (RL) is recognized as lacking generalization and robustness under environmental perturbations, which excessively restricts its application for real-world robotics. Prior work claimed that adding regularization to the value function is equivalent to learning a robust policy under uncertain transitions. Although the regularization-robustness transformation is appealing for its simplicity and efficiency, it is still lacking in continuous control tasks. 
In this paper, we propose a new regularizer named $\\textbf{U}$ncertainty $\\textbf{S}$et $\\textbf{R}$egularizer (USR), to formulate the uncertainty set on the parametric space of a transition function. To deal with unknown uncertainty sets, we further propose a novel adversarial approach to generate them based on the value function. We evaluate USR on the Real-world Reinforcement Learning (RWRL) benchmark and the Unitree A1 Robot, demonstrating improvements in the robust performance of perturbed testing environments and sim-to-real scenarios.", "keywords": "Reinforcement Learning;Robustness;Continuous Control;Robotics", "primary_area": "", "supplementary_material": "/attachment/79a8580c16a07b4630e6ad2aa59c3649c9ca4bbc.zip", "author": "Yuan Zhang;Jianhong Wang;Joschka Boedecker", "authorids": "~Yuan_Zhang8;~Jianhong_Wang1;~Joschka_Boedecker1", "gender": ";M;M", "homepage": ";https://hsvgbkhgbv.github.io/;https://nr.informatik.uni-freiburg.de", "dblp": ";;84/5457", "google_scholar": "gMzGCV0AAAAJ;K1FKF3IAAAAJ;https://scholar.google.de/citations?user=2mv2dDkAAAAJ", "orcid": ";;", "linkedin": ";jianhong-wang-45995b100/;", "or_profile": "~Yuan_Zhang8;~Jianhong_Wang1;~Joschka_B\u00f6decker1", "aff": "University of Freiburg;Imperial College London;Universit\u00e4t Freiburg", "aff_domain": "uni-freiburg.de;ic.ac.uk;uni-freiburg.de", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023robust,\ntitle={Robust Reinforcement Learning in Continuous Control Tasks with Uncertainty Set Regularization},\nauthor={Yuan Zhang and Jianhong Wang and Joschka Boedecker},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=keAPCON4jHC}\n}", "github": "github.com/mikezhang95/rrl_usr", "project": "", "reviewers": "5r3v;2mmx;8DAc;Fg3n", "site": "https://openreview.net/forum?id=keAPCON4jHC", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5174421115796742111&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of Freiburg;Imperial College London", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.imperial.ac.uk", "aff_unique_abbr": "UoF;ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0", "aff_country_unique": "Germany;United Kingdom" }, { "id": "low-53sFqn", "title": "Fleet Active Learning: A Submodular Maximization Approach", "track": "main", "status": "Poster", "tldr": "This work introduces a fleet active learning framework for multi-robot systems.", "abstract": "In multi-robot systems, robots often gather data to improve the performance of their deep neural networks (DNNs) for perception and planning. Ideally, these robots should select the most informative samples from their local data distributions by employing active learning approaches. However, when the data collection is distributed among multiple robots, redundancy becomes an issue as different robots may select similar data points. To overcome this challenge, we propose a fleet active learning (FAL) framework in which robots collectively select informative data samples to enhance their DNN models. Our framework leverages submodular maximization techniques to prioritize the selection of samples with high information gain. 
Through an iterative algorithm, the robots coordinate their efforts to collectively select the most valuable samples while minimizing communication between robots. We provide a theoretical analysis of the performance of our proposed framework and show that it is able to approximate the NP-hard optimal solution. We demonstrate the effectiveness of our framework through experiments on real-world perception and classification datasets, which include autonomous driving datasets such as Berkeley DeepDrive. Our results show an improvement by up to $25.0 \\%$ in classification accuracy, $9.2 \\%$ in mean average precision and $48.5 \\%$ in the submodular objective value compared to a completely distributed baseline.", "keywords": "Active Learning;Cloud Robotics;Robotic Perception", "primary_area": "", "supplementary_material": "/attachment/c30ff8ffca84dda00758b6c351e2906c3c26624b.zip", "author": "Oguzhan Akcin;Orhan Unuvar;Onat Ure;Sandeep P. Chinchali", "authorids": "~Oguzhan_Akcin2;~Orhan_Unuvar1;~Onat_Ure1;~Sandeep_P._Chinchali1", "gender": "M;M;M;", "homepage": ";;;https://www.ece.utexas.edu/people/faculty/sandeep-chinchali", "dblp": "311/3023;;;85/8366", "google_scholar": "2elIEXoAAAAJ;;;262ASa4AAAAJ", "orcid": ";;;", "linkedin": "oguzhan-akcin-0907/;orhan-unuvar/;onat-%C3%BCre-8231aa196/;", "or_profile": "~Oguzhan_Akcin2;~Orhan_Unuvar1;~Onat_Ure1;~Sandeep_Chinchali1", "aff": "The University of Texas at Austin;University of Texas at Austin;University of Texas at Austin;University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;utexas.edu", "position": "PhD student;Intern;Intern;Assistant Professor", "bibtex": "@inproceedings{\nakcin2023fleet,\ntitle={Fleet Active Learning: A Submodular Maximization Approach},\nauthor={Oguzhan Akcin and Orhan Unuvar and Onat Ure and Sandeep P. Chinchali},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=low-53sFqn}\n}", "github": "https://github.com/UTAustin-SwarmLab/Fleet-Active-Learning.git", "project": "", "reviewers": "25hL;fsBm;xJp2", "site": "https://openreview.net/forum?id=low-53sFqn", "pdf_size": 0, "rating": "6;6;6", "confidence": "2;3;4", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13007456691046863412&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "mTZcxs2O7k", "title": "Batch Differentiable Pose Refinement for In-The-Wild Camera/LiDAR Extrinsic Calibration", "track": "main", "status": "Poster", "tldr": "Differentiable pose refinement in batch achieves state-of-the-art zero-shot transfer for camera/LiDAR extrinsic calibration", "abstract": "Accurate camera to LiDAR (Light Detection and Ranging) extrinsic calibration is important for robotic tasks carrying out tight sensor fusion --- such as target tracking and odometry. Calibration is typically performed before deployment in controlled conditions using calibration targets, however, this limits scalability and subsequent recalibration. 
We propose a novel approach for target-free camera-LiDAR calibration using end-to-end direct alignment which doesn't need calibration targets. Our batched formulation enhances sample efficiency during training and robustness at inference time. We present experimental results, on publicly available real-world data, demonstrating 1.6cm/0.07\u00b0 median accuracy when transferred to unseen sensors from held-out data sequences. We also show state-of-the-art zero-shot transfer to unseen cameras, LiDARs, and environments.", "keywords": "Sensor Fusion;Extrinsic Calibration;Differentiable Optimization", "primary_area": "", "supplementary_material": "/attachment/5afded998bc5b329d04081b82a3214c0f7ea6b82.zip", "author": "Lanke Frank Tarimo Fu;Maurice Fallon", "authorids": "~Lanke_Frank_Tarimo_Fu1;~Maurice_Fallon1", "gender": "M;M", "homepage": "https://github.com/fulkast;https://ori.ox.ac.uk/ori-people/maurice-fallon/", "dblp": ";68/7394.html", "google_scholar": ";https://scholar.google.co.uk/citations?user=BqV8LaoAAAAJ", "orcid": ";0000-0003-2940-0879", "linkedin": ";", "or_profile": "~Lanke_Frank_Tarimo_Fu1;~Maurice_Fallon1", "aff": "University of Oxford;University of Oxford", "aff_domain": "robots.ox.ac.uk;ox.ac.uk", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nfu2023batch,\ntitle={Batch Differentiable Pose Refinement for In-The-Wild Camera/Li{DAR} Extrinsic Calibration},\nauthor={Lanke Frank Tarimo Fu and Maurice Fallon},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=mTZcxs2O7k}\n}", "github": "", "project": "", "reviewers": "sEmr;3776;StF3", "site": "https://openreview.net/forum?id=mTZcxs2O7k", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 9, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6963003128051527253&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "n9lew97SAn", "title": "Action-Quantized Offline Reinforcement Learning for Robotic Skill Learning", "track": "main", "status": "Poster", "tldr": "A state-conditioned action discretization method to improve various continuous offline RL methods", "abstract": "The offline reinforcement learning (RL) paradigm provides a general recipe to convert static behavior datasets into policies that can perform better than the policy that collected the data. While policy constraints, conservatism, and other methods for mitigating distributional shifts have made offline reinforcement learning more effective, the continuous action setting often necessitates various approximations for applying these techniques. Many of these challenges are greatly alleviated in discrete action settings, where offline RL constraints and regularizers can often be computed more precisely or even exactly. In this paper, we propose an adaptive scheme for action quantization. We use a VQ-VAE to learn state-conditioned action quantization, avoiding the exponential blowup that comes with na\u00efve discretization of the action space. 
We show that several state-of-the-art offline RL methods such as IQL, CQL, and BRAC improve in performance on benchmarks when combined with our proposed discretization scheme. We further validate our approach on a set of challenging long-horizon complex robotic manipulation tasks in the Robomimic environment, where our discretized offline RL algorithms are able to improve upon their continuous counterparts by 2-3x. Our project page is at saqrl.github.io", "keywords": "Offline Reinforcement Learning;Discretization;Robot Skill Learning", "primary_area": "", "supplementary_material": "/attachment/d98aa7ffb4a99dea0faa24fb74f6a501d08e8646.zip", "author": "Jianlan Luo;Perry Dong;Jeffrey Wu;Aviral Kumar;Xinyang Geng;Sergey Levine", "authorids": "~Jianlan_Luo1;~Perry_Dong1;~Jeffrey_Wu2;~Aviral_Kumar2;~Xinyang_Geng1;~Sergey_Levine1", "gender": ";;M;M;M;M", "homepage": "https://people.eecs.berkeley.edu/~jianlanluo/;;https://jeffreywu13579.github.io/;https://aviralkumar2907.github.io/;http://young-geng.xyz/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "161/1838;;;202/7961;186/8221;80/7594", "google_scholar": "SJoRNbYAAAAJ;;jH_7A6gAAAAJ;;vYougn0AAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;", "linkedin": ";;jeffrey-wu-721977187/;;;", "or_profile": "~Jianlan_Luo1;~Perry_Dong1;~Jeffrey_Wu2;~Aviral_Kumar2;~Xinyang_Geng1;~Sergey_Levine1", "aff": "Google;;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "google.com;;berkeley.edu;berkeley.edu;berkeley.edu;google.com", "position": "Researcher;;Undergrad student;PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nluo2023actionquantized,\ntitle={Action-Quantized Offline Reinforcement Learning for Robotic Skill Learning},\nauthor={Jianlan Luo and Perry Dong and Jeffrey Wu and Aviral Kumar and Xinyang Geng and Sergey Levine},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=n9lew97SAn}\n}", "github": "", "project": "", "reviewers": "Pinn;g7YW;L3mY;Aebd", "site": "https://openreview.net/forum?id=n9lew97SAn", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;4", "rating_avg": 6.5, "confidence_avg": 4.0, "replies_avg": 25, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10769270996418616485&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;0", "aff_unique_norm": "Google;University of California, Berkeley", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.berkeley.edu", "aff_unique_abbr": "Google;UC Berkeley", "aff_campus_unique_index": "0;1;1;1;0", "aff_campus_unique": "Mountain View;Berkeley", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nKWQnYkkwX", "title": "Language-Guided Traffic Simulation via Scene-Level Diffusion", "track": "main", "status": "Oral", "tldr": "A scene-level conditional diffusion model with a LLM based language interface for realistic and controllable traffic simulation.", "abstract": "Realistic and controllable traffic simulation is a core capability that is necessary to accelerate autonomous vehicle (AV) development. However, current approaches for controlling learning-based traffic models require significant domain expertise and are difficult for practitioners to use. To remedy this, we present CTG++, a scene-level conditional diffusion model that can be guided by language instructions. 
Developing this requires tackling two challenges: the need for a realistic and controllable traffic model backbone, and an effective method to interface with a traffic model using language. To address these challenges, we first propose a scene-level diffusion model equipped with a spatio-temporal transformer backbone, which generates realistic and controllable traffic. We then harness a large language model (LLM) to convert a user's query into a loss function, guiding the diffusion model towards query-compliant generation. Through comprehensive evaluation, we demonstrate the effectiveness of our proposed method in generating realistic, query-compliant traffic simulations.", "keywords": "Traffic Simulation;Multi-Agent Diffusion;Large Language Model", "primary_area": "", "supplementary_material": "/attachment/fc3fab125b9e948c57ad371095f7120f015f8003.zip", "author": "Ziyuan Zhong;Davis Rempe;Yuxiao Chen;Boris Ivanovic;Yulong Cao;Danfei Xu;Marco Pavone;Baishakhi Ray", "authorids": "~Ziyuan_Zhong1;~Davis_Rempe1;~Yuxiao_Chen3;~Boris_Ivanovic1;~Yulong_Cao1;~Danfei_Xu1;~Marco_Pavone1;~Baishakhi_Ray2", "gender": "M;M;;;M;M;F;M", "homepage": ";https://davrempe.github.io/;http://www.borisivanovic.com/;https://kikacaty.github.io/;https://cs.stanford.edu/~danfei/;https://web.stanford.edu/~pavone/;http://rayb.info/;https://research.nvidia.com/person/yuxiao-chen", "dblp": "210/2633;199/7804;203/8356;207/6576;135/8443;91/3382-1.html;74/1969;158/4934-1", "google_scholar": "GiB1pSsAAAAJ;BVde3Y0AAAAJ;ey9AQcEAAAAJ;uclqBzgAAAAJ;J5D4kcoAAAAJ;RhOpyXcAAAAJ;https://scholar.google.com.tw/citations?user=VaAEb5YAAAAJ;AOdxmJYAAAAJ", "orcid": ";;0000-0002-8698-202X;;;;;0000-0001-5276-7156", "linkedin": ";;boris-ivanovic-a3103064;;;;;", "or_profile": "~Ziyuan_Zhong1;~Davis_Rempe1;~Boris_Ivanovic1;~Yulong_Cao1;~Danfei_Xu1;~Marco_Pavone1;~Baishakhi_Ray2;~Yuxiao_Chen2", "aff": "Columbia University;Stanford University;NVIDIA;NVIDIA;NVIDIA;Stanford University;Columbia University;California Institute of Technology", "aff_domain": "columbia.edu;stanford.edu;nvidia.com;nvidia.com;nvidia.com;stanford.edu;columbia.edu;caltech.edu", "position": "PhD student;PhD student;Researcher;Researcher;Research Scientist;Associate Professor;Assistant Professor;Postdoc", "bibtex": "@inproceedings{\nzhong2023languageguided,\ntitle={Language-Guided Traffic Simulation via Scene-Level Diffusion},\nauthor={Ziyuan Zhong and Davis Rempe and Yuxiao Chen and Boris Ivanovic and Yulong Cao and Danfei Xu and Marco Pavone and Baishakhi Ray},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=nKWQnYkkwX}\n}", "github": "", "project": "", "reviewers": "eWu2;qk9Q;J6rY;QJon", "site": "https://openreview.net/forum?id=nKWQnYkkwX", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "2;3;4;4", "rating_avg": 7.5, "confidence_avg": 3.25, "replies_avg": 10, "authors#_avg": 8, "corr_rating_confidence": 0.986440050415621, "gs_citation": 94, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4457686760702090474&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;2;2;1;0;3", "aff_unique_norm": "Columbia University;Stanford University;NVIDIA;California Institute of Technology", "aff_unique_dep": ";;NVIDIA Corporation;", "aff_unique_url": "https://www.columbia.edu;https://www.stanford.edu;https://www.nvidia.com;https://www.caltech.edu", "aff_unique_abbr": "Columbia;Stanford;NVIDIA;Caltech", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Stanford;Pasadena", 
"aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nNsZxc2cmO", "title": "FindThis: Language-Driven Object Disambiguation in Indoor Environments", "track": "main", "status": "Poster", "tldr": "We present a new task, dataset, and method focused on language-driven object disambiguation in indoor 3D environments.", "abstract": "Natural language is naturally ambiguous. In this work, we consider interactions between a user and a mobile service robot tasked with locating a desired object, specified by a language utterance. We present a task FindThis, which addresses the problem of how to disambiguate and locate the particular object instance desired through a dialog with the user. To approach this problem we propose an algorithm, GoFind, which exploits visual attributes of the object that may be intrinsic (e.g., color, shape), or extrinsic (e.g., location, relationships to other entities), expressed in an open vocabulary. GoFind leverages the visual common sense learned by large language models to enable fine-grained object localization and attribute differentiation in a zero-shot manner. We also provide a new visio-linguistic dataset, 3D Objects in Context (3DOC), for evaluating agents on this task consisting of Google Scanned Objects placed in Habitat-Matterport 3D scenes. Finally, we validate our approach on a real robot operating in an unstructured physical office environment using complex fine-grained language instructions.", "keywords": "object disambiguation;instruction following;language interaction;visual navigation", "primary_area": "", "supplementary_material": "/attachment/8ac9f2c12f5fdbc8cc80a7e2e7fa9747cae13799.zip", "author": "Arjun Majumdar;Fei Xia;brian ichter;Dhruv Batra;Leonidas Guibas", "authorids": "~Arjun_Majumdar2;~Fei_Xia1;~brian_ichter1;~Dhruv_Batra1;~Leonidas_Guibas1", "gender": "M;M;;Not Specified;M", "homepage": "https://arjunmajum.github.io/;;;https://dhruvbatra.com;http://geometry.stanford.edu/", "dblp": "168/2927;;;67/6586;g/LeonidasJGuibas", "google_scholar": "nyicsDgAAAAJ;pqP5_PgAAAAJ;-w5DuHgAAAAJ;_bs7PqgAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ", "orcid": ";0000-0003-4343-1444;;;", "linkedin": ";;;;", "or_profile": "~Arjun_Majumdar2;~Fei_Xia1;~brian_ichter1;~Dhruv_Batra1;~Leonidas_Guibas1", "aff": "Georgia Institute of Technology;Google;Google;Georgia Institute of Technology;Stanford University", "aff_domain": "gatech.edu;google.com;google.com;gatech.edu;stanford.edu", "position": "PhD student;Researcher;Research Scientist;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nmajumdar2023findthis,\ntitle={FindThis: Language-Driven Object Disambiguation in Indoor Environments},\nauthor={Arjun Majumdar and Fei Xia and brian ichter and Dhruv Batra and Leonidas Guibas},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=nNsZxc2cmO}\n}", "github": "", "project": "", "reviewers": "r5uR;rPQj;pqhb;qZX4", "site": "https://openreview.net/forum?id=nNsZxc2cmO", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;5;4", "rating_avg": 6.0, "confidence_avg": 4.25, "replies_avg": 13, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11828810952889778327&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "Georgia Institute of Technology;Google;Stanford University", "aff_unique_dep": ";Google;", 
"aff_unique_url": "https://www.gatech.edu;https://www.google.com;https://www.stanford.edu", "aff_unique_abbr": "Georgia Tech;Google;Stanford", "aff_campus_unique_index": "1;1;2", "aff_campus_unique": ";Mountain View;Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nyY6UgXYyfF", "title": "Adv3D: Generating Safety-Critical 3D Objects through Closed-Loop Simulation", "track": "main", "status": "Poster", "tldr": "We propose a system to generate safety-critical actor shapes for autonomy testing through closed-loop simulation of real-world traffic scenarios", "abstract": "Self-driving vehicles (SDVs) must be rigorously tested on a wide range of scenarios to ensure safe deployment. The industry typically relies on closed-loop simulation to evaluate how the SDV interacts on a corpus of synthetic and real scenarios and to verify good performance. However, they primarily only test the motion planning module of the system, and only consider behavior variations. It is key to evaluate the full autonomy system in closed-loop, and to understand how variations in sensor data based on scene appearance, such as the shape of actors, affect system performance. In this paper, we propose a framework, Adv3D, that takes real world scenarios and performs closed-loop sensor simulation to evaluate autonomy performance, and finds vehicle shapes that make the scenario more challenging, resulting in autonomy failures and uncomfortable SDV maneuvers. Unlike prior work that add contrived adversarial shapes to vehicle roof-tops or roadside to harm perception performance, we optimize a low-dimensional shape representation to modify the vehicle shape itself in a realistic manner to degrade full autonomy performance (e.g., perception, prediction, motion planning). 
Moreover, we find that the shape variations found with Adv3D optimized in closed-loop are much more effective than open-loop, demonstrating the importance of finding and testing scene appearance variations that affect full autonomy performance.", "keywords": "closed-loop simulation;adversarial robustness;self-driving", "primary_area": "", "supplementary_material": "/attachment/158512bbf9d989e2d8465a523f72d7550780d114.zip", "author": "Jay Sarva;Jingkang Wang;James Tu;Yuwen Xiong;Sivabalan Manivasagam;Raquel Urtasun", "authorids": "~Jay_Sarva1;~Jingkang_Wang1;~James_Tu1;~Yuwen_Xiong1;~Sivabalan_Manivasagam1;~Raquel_Urtasun1", "gender": ";M;M;M;;F", "homepage": ";http://www.cs.toronto.edu/~wangjk/;;http://www.yuwenxiong.com;;http://www.cs.toronto.edu/~urtasun/", "dblp": ";223/9910;;178/3589;;u/RaquelUrtasun", "google_scholar": ";c0BTYC4AAAAJ;https://scholar.google.ca/citations?user=x6gPeg4AAAAJ;7YALCcIAAAAJ;;https://scholar.google.ca/citations?user=jyxO2akAAAAJ", "orcid": ";;;;;", "linkedin": "jay-sarva/;;;;;", "or_profile": "~Jay_Sarva1;~Jingkang_Wang1;~James_Tu1;~Yuwen_Xiong1;~Sivabalan_Manivasagam1;~Raquel_Urtasun1", "aff": "Brown University;University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;;Department of Computer Science, University of Toronto", "aff_domain": "brown.edu;toronto.edu;cs.toronto.edu;cs.toronto.edu;;cs.toronto.edu", "position": "Undergrad student;PhD student;PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\nsarva2023advd,\ntitle={Adv3D: Generating Safety-Critical 3D Objects through Closed-Loop Simulation},\nauthor={Jay Sarva and Jingkang Wang and James Tu and Yuwen Xiong and Sivabalan Manivasagam and Raquel Urtasun},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=nyY6UgXYyfF}\n}", "github": "", "project": "", "reviewers": "xR6w;htaN;nxxv;BYfr", "site": "https://openreview.net/forum?id=nyY6UgXYyfF", "pdf_size": 0, "rating": "4;4;6;10", "confidence": "4;3;1;3", "rating_avg": 6.0, "confidence_avg": 2.75, "replies_avg": 15, "authors#_avg": 6, "corr_rating_confidence": -0.18731716231633877, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1836997029681930019&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Brown University;University of Toronto", "aff_unique_dep": ";", "aff_unique_url": "https://www.brown.edu;https://www.utoronto.ca", "aff_unique_abbr": "Brown;U of T", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";Toronto", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "United States;Canada" }, { "id": "o-K3HVUeEw", "title": "Composable Part-Based Manipulation", "track": "main", "status": "Poster", "tldr": "We compose diffusion models based on different part-part correspondences to improve learning and generalization of robotic manipulation skills.", "abstract": "In this paper, we propose composable part-based manipulation (CPM), a novel approach that leverages object-part decomposition and part-part correspondences to improve learning and generalization of robotic manipulation skills. By considering the functional correspondences between object parts, we conceptualize functional actions, such as pouring and constrained placing, as combinations of different correspondence constraints. 
CPM comprises a collection of composable diffusion models, where each model captures a different inter-object correspondence. These diffusion models can generate parameters for manipulation skills based on the specific object parts. Leveraging part-based correspondences coupled with the task decomposition into distinct constraints enables strong generalization to novel objects and object categories. We validate our approach in both simulated and real-world scenarios, demonstrating its effectiveness in achieving robust and generalized manipulation capabilities.", "keywords": "Manipulation;Part Decomposition;Diffusion Model", "primary_area": "", "supplementary_material": "", "author": "Weiyu Liu;Jiayuan Mao;Joy Hsu;Tucker Hermans;Animesh Garg;Jiajun Wu", "authorids": "~Weiyu_Liu1;~Jiayuan_Mao1;~Joy_Hsu2;~Tucker_Hermans2;~Animesh_Garg1;~Jiajun_Wu1", "gender": "M;F;F;M;M;M", "homepage": "http://weiyuliu.com/;http://jiayuanm.com;https://web.stanford.edu/~joycj/;https://robot-learning.cs.utah.edu;http://animesh.garg.tech;https://jiajunwu.com", "dblp": "133/0311.html;200/8283;258/5012;https://dblp.uni-trier.de/pid/67/4241;123/5728;117/4768", "google_scholar": "PHi0YEQAAAAJ;-xaOIZIAAAAJ;Zr7RJT4AAAAJ;G5_VFfkAAAAJ;zp8V7ZMAAAAJ;2efgcS0AAAAJ", "orcid": ";0000-0003-4798-3748;;0000-0003-2496-2768;0000-0003-0482-4296;0000-0002-4176-343X", "linkedin": ";;;;animeshgarg/;jiajunwu/", "or_profile": "~Weiyu_Liu1;~Jiayuan_Mao1;~Joy_Hsu2;~Tucker_Hermans2;~Animesh_Garg1;~Jiajun_Wu1", "aff": "Stanford University;Massachusetts Institute of Technology;Stanford University;University of Utah;University of Toronto;Stanford University", "aff_domain": "stanford.edu;mit.edu;stanford.edu;utah.edu;toronto.edu;stanford.edu", "position": "Postdoc;PhD student;PhD student;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2023composable,\ntitle={Composable Part-Based Manipulation},\nauthor={Weiyu Liu and Jiayuan Mao and Joy Hsu and Tucker Hermans and Animesh Garg and Jiajun Wu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=o-K3HVUeEw}\n}", "github": "https://cpmcorl2023.github.io/", "project": "", "reviewers": "ayCc;3XuZ;djYt;6mc4", "site": "https://openreview.net/forum?id=o-K3HVUeEw", "pdf_size": 0, "rating": "1;6;6;6", "confidence": "4;5;3;4", "rating_avg": 4.75, "confidence_avg": 4.0, "replies_avg": 10, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8993644855914312131&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;2;3;0", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology;University of Utah;University of Toronto", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu;https://www.utah.edu;https://www.utoronto.ca", "aff_unique_abbr": "Stanford;MIT;Utah;U of T", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;Canada" }, { "id": "o2wNSCTkq0", "title": "Learning Sequential Acquisition Policies for Robot-Assisted Feeding", "track": "main", "status": "Poster", "tldr": "We present a hierarchical framework for dexterous, long-horizon bite acquisition in robot-assisted feeding.", "abstract": "A robot providing mealtime assistance must perform specialized maneuvers with various utensils in order to pick up and feed a range of food items. 
Beyond these dexterous low-level skills, an assistive robot must also plan these strategies in sequence over a long horizon to clear a plate and complete a meal. Previous methods in robot-assisted feeding introduce highly specialized primitives for food handling without a means to compose them together. Meanwhile, existing approaches to long-horizon manipulation lack the flexibility to embed highly specialized primitives into their frameworks. We propose Visual Action Planning OveR Sequences (VAPORS), a framework for long-horizon food acquisition. VAPORS learns a policy for high-level action selection by leveraging learned latent plate dynamics in simulation. To carry out sequential plans in the real world, VAPORS delegates action execution to visually parameterized primitives. We validate our approach on complex real-world acquisition trials involving noodle acquisition and bimanual scooping of jelly beans. Across 38 plates, VAPORS acquires much more efficiently than baselines, generalizes across realistic plate variations such as toppings and sauces, and qualitatively appeals to user feeding preferences in a survey conducted across 49 individuals. Code, datasets, videos, and supplementary materials can be found on our website: https://sites.google.com/view/vaporsbot.", "keywords": "Deformable Manipulation;Dexterous Manipulation", "primary_area": "", "supplementary_material": "/attachment/d6f7deb364b6b635f5516b86ba3bce9c4038fb0c.zip", "author": "Priya Sundaresan;Jiajun Wu;Dorsa Sadigh", "authorids": "~Priya_Sundaresan1;~Jiajun_Wu1;~Dorsa_Sadigh1", "gender": "F;M;F", "homepage": ";https://jiajunwu.com;https://dorsa.fyi/", "dblp": ";117/4768;117/3174", "google_scholar": "7SUquR4AAAAJ;2efgcS0AAAAJ;ZaJEZpYAAAAJ", "orcid": ";0000-0002-4176-343X;", "linkedin": ";jiajunwu/;", "or_profile": "~Priya_Sundaresan1;~Jiajun_Wu1;~Dorsa_Sadigh1", "aff": "Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nsundaresan2023learning,\ntitle={Learning Sequential Acquisition Policies for Robot-Assisted Feeding},\nauthor={Priya Sundaresan and Jiajun Wu and Dorsa Sadigh},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=o2wNSCTkq0}\n}", "github": "https://sites.google.com/view/vaporsbot", "project": "", "reviewers": "cA3F;XrpJ;iomA", "site": "https://openreview.net/forum?id=o2wNSCTkq0", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;5;4", "rating_avg": 7.333333333333333, "confidence_avg": 4.333333333333333, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4680222469848966897&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "o82EXEK5hu6", "title": "Parting with Misconceptions about Learning-based Vehicle Motion Planning", "track": "main", "status": "Poster", "tldr": "", "abstract": "The release of nuPlan marks a new era in vehicle motion planning research, offering the first large-scale real-world dataset and evaluation schemes requiring both precise 
short-term planning and long-horizon ego-forecasting. Existing systems struggle to simultaneously meet both requirements. Indeed, we find that these tasks are fundamentally misaligned and should be addressed independently. We further assess the current state of closed-loop planning in the field, revealing the limitations of learning-based methods in complex real-world scenarios and the value of simple rule-based priors such as centerline selection through lane graph search algorithms. More surprisingly, for the open-loop sub-task, we observe that the best results are achieved when using only this centerline as scene context (i.e., ignoring all information regarding the map and other agents). Combining these insights, we propose an extremely simple and efficient planner which outperforms an extensive set of competitors, winning the nuPlan planning challenge 2023.", "keywords": "Motion Planning;Autonomous Driving;Data-driven Simulation", "primary_area": "", "supplementary_material": "/attachment/1f2f8229f27eefd4c07945aec8315e8f5128119e.zip", "author": "Daniel Dauner;Marcel Hallgarten;Andreas Geiger;Kashyap Chitta", "authorids": "~Daniel_Dauner1;~Marcel_Hallgarten1;~Andreas_Geiger3;~Kashyap_Chitta1", "gender": ";M;M;M", "homepage": "https://danieldauner.github.io/;https://uni-tuebingen.de/fr/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/kognitive-systeme/the-chair/staff/marcel-hallgarten/;http://www.cvlibs.net;https://kashyap7x.github.io/", "dblp": "349/4864;;40/5825-1;220/3765", "google_scholar": "tZqIYDcAAAAJ;;https://scholar.google.ca/citations?hl=en;vX5i2CcAAAAJ", "orcid": ";;0000-0002-8151-3726;", "linkedin": ";;;", "or_profile": "~Daniel_Dauner1;~Marcel_Hallgarten1;~Andreas_Geiger3;~Kashyap_Chitta1", "aff": "Eberhard-Karls-Universit\u00e4t T\u00fcbingen;Eberhard-Karls-Universit\u00e4t T\u00fcbingen;University of Tuebingen;University of T\u00fcbingen", "aff_domain": "uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de;uni-tuebingen.de", "position": "MS student;PhD student;Professor;PhD student", "bibtex": "@inproceedings{\ndauner2023parting,\ntitle={Parting with Misconceptions about Learning-based Vehicle Motion Planning},\nauthor={Daniel Dauner and Marcel Hallgarten and Andreas Geiger and Kashyap Chitta},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=o82EXEK5hu6}\n}", "github": "https://github.com/autonomousvision/tuplan_garage", "project": "", "reviewers": "qggz;xfB2;KEQU;7G8b", "site": "https://openreview.net/forum?id=o82EXEK5hu6", "pdf_size": 0, "rating": "4;4;10;10", "confidence": "4;3;4;3", "rating_avg": 7.0, "confidence_avg": 3.5, "replies_avg": 16, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 141, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8280862561958389402&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Eberhard Karls University of T\u00fcbingen;University of Tuebingen;University of T\u00fcbingen", "aff_unique_dep": ";;", "aff_unique_url": "https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/;https://www.uni-tuebingen.de/", "aff_unique_abbr": "Uni T\u00fcbingen;Uni T\u00fcbingen;Uni T\u00fcbingen", "aff_campus_unique_index": "0;0", "aff_campus_unique": "T\u00fcbingen;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "oqOfLP6bJy", "title": "Contrastive Value Learning: Implicit Models for Simple Offline RL", "track": "main", "status": 
"Poster", "tldr": "", "abstract": "Model-based reinforcement learning (RL) methods are appealing in the offline setting because they allow an agent to reason about the consequences of actions without interacting with the environment. While conventional model-based methods learn a 1-step model, predicting the immediate next state, these methods must be plugged into larger planning or RL systems to yield a policy. Can we model the environment dynamics in a different way, such that the learned model directly indicates the value of each action? In this paper, we propose Contrastive Value Learning (CVL), which learns an implicit, multi-step dynamics model. This model can be learned without access to reward functions, but nonetheless can be used to directly estimate the value of each action, without requiring any TD learning. Because this model represents the multi-step transitions implicitly, it avoids having to predict high-dimensional observations and thus scales to high-dimensional tasks. Our experiments demonstrate that CVL outperforms prior offline RL methods on complex robotics benchmarks.", "keywords": "reinforcement learning;robotics;metaworld;unsupervised learning;contrastive learning;noise-contrastive estimation;generative model", "primary_area": "", "supplementary_material": "/attachment/65b3d9b928f68dc8b2c265a4007b73918d4cc4b0.zip", "author": "Bogdan Mazoure;Benjamin Eysenbach;Ofir Nachum;Jonathan Tompson", "authorids": "~Bogdan_Mazoure1;~Benjamin_Eysenbach1;~Ofir_Nachum1;~Jonathan_Tompson1", "gender": "M;M;M;M", "homepage": "https://bmazoure.github.io;https://ben-eysenbach.github.io/;https://scholar.google.com/citations?user=C-ZlBWMAAAAJ&hl=en;http://jonathantompson.com", "dblp": ";192/1863;;139/0769", "google_scholar": "https://scholar.google.ca/citations?user=NaxShlcAAAAJ;DRnOvU8AAAAJ;C-ZlBWMAAAAJ;U_Jw8DUAAAAJ", "orcid": ";0009-0000-7136-6307;;", "linkedin": ";benjamin-eysenbach-a7235775/;;", "or_profile": "~Bogdan_Mazoure1;~Benjamin_Eysenbach1;~Ofir_Nachum1;~Jonathan_Tompson1", "aff": "Apple;Carnegie Mellon University;OpenAI;Google DeepMind", "aff_domain": "apple.com;cmu.edu;openai.com;google.com", "position": "Research Scientist;PhD student;Researcher;Researcher", "bibtex": "@inproceedings{\nmazoure2023contrastive,\ntitle={Contrastive Value Learning: Implicit Models for Simple Offline {RL}},\nauthor={Bogdan Mazoure and Benjamin Eysenbach and Ofir Nachum and Jonathan Tompson},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=oqOfLP6bJy}\n}", "github": "", "project": "", "reviewers": "X3Ux;W8jU;pfTt;GZ9B", "site": "https://openreview.net/forum?id=oqOfLP6bJy", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "2;3;3;4", "rating_avg": 7.0, "confidence_avg": 3.0, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 0.816496580927726, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2551244377231439780&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "Apple;Carnegie Mellon University;OpenAI;Google", "aff_unique_dep": "Apple Inc.;;;Google DeepMind", "aff_unique_url": "https://www.apple.com;https://www.cmu.edu;https://openai.com;https://deepmind.com", "aff_unique_abbr": "Apple;CMU;OpenAI;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "oyWkrG-LD5", "title": "Geometry Matching for Multi-Embodiment Grasping", 
"track": "main", "status": "Poster", "tldr": "", "abstract": "While significant progress has been made on the problem of generating grasps, many existing learning-based approaches still concentrate on a single embodiment, provide limited generalization to higher DoF end-effectors and cannot capture a diverse set of grasp modes. In this paper, we tackle the problem of grasping multi-embodiments through the viewpoint of learning rich geometric representations for both objects and end-effectors using Graph Neural Networks (GNN). Our novel method - GeoMatch - applies supervised learning on grasping data from multiple embodiments, learning end-to-end contact point likelihood maps as well as conditional autoregressive prediction of grasps keypoint-by-keypoint. We compare our method against 3 baselines that provide multi-embodiment support. Our approach performs better across 3 end-effectors, while also providing competitive diversity of grasps. Examples can be found at geomatch.github.io.", "keywords": "Multi-Embodiment;Dexterous Grasping;Graph Neural Networks", "primary_area": "", "supplementary_material": "/attachment/a3a50c974fde19e0c797782478146fbae0362aa6.zip", "author": "Maria Attarian;Muhammad Adil Asif;Jingzhou Liu;Ruthrash Hari;Animesh Garg;Igor Gilitschenski;Jonathan Tompson", "authorids": "~Maria_Attarian1;~Muhammad_Adil_Asif1;~Jingzhou_Liu2;ruthrash.hari@mail.utoronto.ca;~Animesh_Garg1;~Igor_Gilitschenski1;~Jonathan_Tompson1", "gender": "F;M;M;;M;M;M", "homepage": "https://jmattarian.com/;https://www.cs.toronto.edu/~adilasif/;https://jasonjzliu.com/;;http://animesh.garg.tech;https://www.gilitschenski.org/igor;http://jonathantompson.com", "dblp": ";;;;123/5728;129/1281;139/0769", "google_scholar": "6Hk7QdkAAAAJ;-3BA9OEAAAAJ;iktexc8AAAAJ;;zp8V7ZMAAAAJ;Nuw1Y4oAAAAJ;U_Jw8DUAAAAJ", "orcid": ";;;;0000-0003-0482-4296;;", "linkedin": "maria-attarian/;;jasonjzliu/;;animeshgarg/;igorgilitschenski/;", "or_profile": "~Maria_Attarian1;~Muhammad_Adil_Asif1;~Jingzhou_Liu2;ruthrash.hari@mail.utoronto.ca;~Animesh_Garg1;~Igor_Gilitschenski1;~Jonathan_Tompson1", "aff": "Google;University of Toronto;University of Toronto;;University of Toronto;University of Toronto;Google DeepMind", "aff_domain": "google.com;utoronto.ca;utoronto.ca;;toronto.edu;toronto.edu;google.com", "position": "Researcher;Undergrad student;Undergrad student;;Assistant Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nattarian2023geometry,\ntitle={Geometry Matching for Multi-Embodiment Grasping},\nauthor={Maria Attarian and Muhammad Adil Asif and Jingzhou Liu and Ruthrash Hari and Animesh Garg and Igor Gilitschenski and Jonathan Tompson},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=oyWkrG-LD5}\n}", "github": "", "project": "", "reviewers": "U6zR;vXSr;dMoq;AGep", "site": "https://openreview.net/forum?id=oyWkrG-LD5", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "5;3;5;3", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 15, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16728594368705784326&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;1;1;1;1;0", "aff_unique_norm": "Google;University of Toronto", "aff_unique_dep": "Google;", "aff_unique_url": "https://www.google.com;https://www.utoronto.ca", "aff_unique_abbr": "Google;U of T", "aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": 
"0;1;1;1;1;2", "aff_country_unique": "United States;Canada;United Kingdom" }, { "id": "pLCQkMojXI", "title": "Rearrangement Planning for General Part Assembly", "track": "main", "status": "Oral", "tldr": "We propose to investigate general part assembly, the task of creating novel target assemblies with unseen part shapes, and introduce General Part Assembly Transformer (GPAT) for the task.", "abstract": "Most successes in autonomous robotic assembly have been restricted to single target or category. We propose to investigate general part assembly, the task of creating novel target assemblies with unseen part shapes. As a fundamental step to a general part assembly system, we tackle the task of determining the precise poses of the parts in the target assembly, which we term \u201crearrangement planning\u201d. We present General Part Assembly Transformer (GPAT), a transformer-based model architecture that accurately predicts part poses by inferring how each part shape corresponds to the target shape. Our experiments on both 3D CAD models and real-world scans demonstrate GPAT\u2019s generalization abilities to novel and diverse target and part shapes.", "keywords": "robotic assembly;pose estimation;3D perception", "primary_area": "", "supplementary_material": "/attachment/16bb00528be09f18c771eb41bb0afe089956e5ec.zip", "author": "Yulong Li;Andy Zeng;Shuran Song", "authorids": "~Yulong_Li1;~Andy_Zeng3;~Shuran_Song3", "gender": "M;F;M", "homepage": "https://yulongli42.github.io;https://shurans.github.io/;http://andyzeng.github.io/", "dblp": ";;http://dblp.uni-trier.de/pers/hd/z/Zeng:Andy", "google_scholar": ";https://scholar.google.com/citations?hl=en;q7nFtUcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Yulong_Li1;~Shuran_Song3;~Andy_Zeng1", "aff": "Columbia University;Columbia University;Google", "aff_domain": "columbia.edu;cs.columbia.edu;google.com", "position": "Undergrad student;Assistant Professor;Research Scientist", "bibtex": "@inproceedings{\nli2023rearrangement,\ntitle={Rearrangement Planning for General Part Assembly},\nauthor={Yulong Li and Andy Zeng and Shuran Song},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=pLCQkMojXI}\n}", "github": "https://github.com/real-stanford/gpat", "project": "", "reviewers": "wPeg;v4vx;sArU;JQms", "site": "https://openreview.net/forum?id=pLCQkMojXI", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "4;4;4;4", "rating_avg": 7.0, "confidence_avg": 4.0, "replies_avg": 13, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6954861317270508985&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Columbia University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.columbia.edu;https://www.google.com", "aff_unique_abbr": "Columbia;Google", "aff_campus_unique_index": "1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "psLlVbTFBua", "title": "FlowBot++: Learning Generalized Articulated Objects Manipulation via Articulation Projection", "track": "main", "status": "Poster", "tldr": "A novel learned visual representation for articulated objects manipulation", "abstract": "Understanding and manipulating articulated objects, such as doors and drawers, is crucial for robots operating in human environments. 
We wish to develop a system that can learn to articulate novel objects with no prior interaction, after training on other articulated objects. Previous approaches for articulated object manipulation rely on either modular methods which are brittle or end-to-end methods, which lack generalizability. This paper presents FlowBot++, a deep 3D vision-based robotic system that predicts dense per-point motion and dense articulation parameters of articulated objects to assist in downstream manipulation tasks. FlowBot++ introduces a novel per-point representation of the articulated motion and articulation parameters that are combined to produce a more accurate estimate than either method on their own. Simulated experiments on the PartNet-Mobility dataset validate the performance of our system in articulating a wide range of objects, while real-world experiments on real objects' point clouds and a Sawyer robot demonstrate the generalizability and feasibility of our system in real-world scenarios. Videos are available on our anonymized website https://sites.google.com/view/flowbotpp/home", "keywords": "Articulated objects manipulation;representation learning", "primary_area": "", "supplementary_material": "/attachment/201d9d5eede50d63d199087b89ed6cbdbb12a127.zip", "author": "Harry Zhang;Ben Eisner;David Held", "authorids": "~Harry_Zhang2;~Ben_Eisner1;~David_Held1", "gender": "M;M;M", "homepage": "https://harryzhangog.github.io/;;http://davheld.github.io/", "dblp": ";;22/11147", "google_scholar": "e-p7KiUAAAAJ;RWe-v0UAAAAJ;0QtU-NsAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Harry_Zhang2;~Ben_Eisner1;~David_Held1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;cmu.edu", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2023flowbot,\ntitle={FlowBot++: Learning Generalized Articulated Objects Manipulation via Articulation Projection},\nauthor={Harry Zhang and Ben Eisner and David Held},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=psLlVbTFBua}\n}", "github": "https://sites.google.com/view/flowbotpp/home", "project": "", "reviewers": "frrv;E3aG;8kbD;uUTN", "site": "https://openreview.net/forum?id=psLlVbTFBua", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;5;3;4", "rating_avg": 6.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 35, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5027766040099322660&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "psyvs5wdAV", "title": "Equivariant Motion Manifold Primitives", "track": "main", "status": "Poster", "tldr": "We propose a new family of highly adaptable primitive models, Equivariant Motion Manifold Primitives (EMMP), which consider inherent symmetry in the robot tasks.", "abstract": "Existing movement primitive models for the most part focus on representing and generating a single trajectory for a given task, limiting their adaptability to situations in which unforeseen obstacles or new constraints may arise. 
In this work we propose Motion Manifold Primitives (MMP), a movement primitive paradigm that encodes and generates, for a given task, a continuous manifold of trajectories each of which can achieve the given task. To address the challenge of learning each motion manifold from a limited amount of data, we exploit inherent symmetries in the robot task by constructing motion manifold primitives that are equivariant with respect to given symmetry groups. Under the assumption that each of the MMPs can be smoothly deformed into each other, an autoencoder framework is developed to encode the MMPs and also generate solution trajectories. Experiments involving synthetic and real-robot examples demonstrate that our method outperforms existing manifold primitive methods by significant margins. Code is available at https://github.com/dlsfldl/EMMP-public.", "keywords": "Movement primitives;Manifold;LfD;Equivariance", "primary_area": "", "supplementary_material": "/attachment/9786df0333d326f274b642fafb852cc7ac528e51.zip", "author": "Byeongho Lee;Yonghyeon Lee;Seungyeon Kim;MinJun Son;Frank C. Park", "authorids": "~Byeongho_Lee2;~Yonghyeon_Lee2;~Seungyeon_Kim2;~MinJun_Son1;~Frank_C._Park1", "gender": ";M;M;M;M", "homepage": ";https://www.gabe-yhlee.com;https://seungyeon-k.github.io/;https://sites.google.com/robotics.snu.ac.kr/fcp/;http://robotics.snu.ac.kr", "dblp": ";182/6796;74/7997-3;;p/FrankChongwooPark", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;u-h3PJIAAAAJ", "orcid": ";;0000-0001-6708-5684;;0000-0002-0293-6975", "linkedin": ";;seungyeon-kim-45a20b263/;;", "or_profile": "~Byeongho_Lee2;~Yonghyeon_Lee2;~Seungyeon_Kim2;~MinJun_Son1;~Frank_C._Park1", "aff": ";Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": ";snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": ";PhD student;PhD student;MS student;Full Professor", "bibtex": "@inproceedings{\nlee2023equivariant,\ntitle={Equivariant Motion Manifold Primitives},\nauthor={Byeongho Lee and Yonghyeon Lee and Seungyeon Kim and MinJun Son and Frank C. Park},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=psyvs5wdAV}\n}", "github": "https://github.com/dlsfldl/EMMP-public", "project": "", "reviewers": "ZGAZ;rfrA;K1vn;3SRv", "site": "https://openreview.net/forum?id=psyvs5wdAV", "pdf_size": 0, "rating": "6;10;10;10", "confidence": "3;4;3;5", "rating_avg": 9.0, "confidence_avg": 3.75, "replies_avg": 11, "authors#_avg": 5, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6845458895865587674&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "pw-OTIYrGa", "title": "On the Utility of Koopman Operator Theory in Learning Dexterous Manipulation Skills", "track": "main", "status": "Oral", "tldr": "This paper investigates the utility of Koopman operator theory on dexterous manipulation tasks and reveals a number of unique benefits.", "abstract": "Despite impressive dexterous manipulation capabilities enabled by learning-based approaches, we are yet to witness widespread adoption beyond well-resourced laboratories. 
This is likely due to practical limitations, such as significant computational burden, inscrutable learned behaviors, sensitivity to initialization, and the considerable technical expertise required for implementation. In this work, we investigate the utility of Koopman operator theory in alleviating these limitations. Koopman operators are simple yet powerful control-theoretic structures to represent complex nonlinear dynamics as linear systems in higher dimensions. Motivated by the fact that complex nonlinear dynamics underlie dexterous manipulation, we develop a Koopman operator-based imitation learning framework to learn the desired motions of both the robotic hand and the object simultaneously. We show that Koopman operators are surprisingly effective for dexterous manipulation and offer a number of unique benefits. Notably, policies can be learned analytically, drastically reducing computation burden and eliminating sensitivity to initialization and the need for painstaking hyperparameter optimization. Our experiments reveal that a Koopman operator-based approach can perform comparably to state-of-the-art imitation learning algorithms in terms of success rate and sample efficiency, while being an order of magnitude faster. Policy videos can be viewed at https://sites.google.com/view/kodex-corl.", "keywords": "Koopman Operator;Dexterous Manipulation", "primary_area": "", "supplementary_material": "/attachment/fc07563029d3ac27eaf32b96bd47feb73dd16d39.zip", "author": "Yunhai Han;Mandy Xie;Ye Zhao;Harish Ravichandar", "authorids": "~Yunhai_Han1;~Mandy_Xie1;~Ye_Zhao2;~Harish_Ravichandar1", "gender": "M;F;M;", "homepage": "https://y8han.github.io/;https://mandyxie.github.io/;http://lab-idar.gatech.edu/;http://harishravichandar.com/", "dblp": "276/6126;;;237/9959", "google_scholar": "lsN3nY8AAAAJ;;;d2HP6SMAAAAJ", "orcid": ";;;0000-0002-6635-2637", "linkedin": ";;;", "or_profile": "~Yunhai_Han1;~Mandy_Xie1;~Ye_Zhao2;~Harish_Ravichandar1", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhan2023on,\ntitle={On the Utility of Koopman Operator Theory in Learning Dexterous Manipulation Skills},\nauthor={Yunhai Han and Mandy Xie and Ye Zhao and Harish Ravichandar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=pw-OTIYrGa}\n}", "github": "https://github.com/GT-STAR-Lab/KODex", "project": "", "reviewers": "k76a;RjjK;aS5H", "site": "https://openreview.net/forum?id=pw-OTIYrGa", "pdf_size": 0, "rating": "6;6;10", "confidence": "4;3;3", "rating_avg": 7.333333333333333, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2114076631146679498&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "q0VAoefCI2", "title": "Task-Oriented Koopman-Based Control with Contrastive Encoder", "track": "main", "status": "Oral", "tldr": "We present 
task-oriented Koopman-based control that utilizes end-to-end reinforcement learning and contrastive encoder to simultaneously learn the Koopman latent embedding, operator and associated linear controller within an iterative loop", "abstract": "We present task-oriented Koopman-based control that utilizes end-to-end reinforcement learning and contrastive encoder to simultaneously learn the Koopman latent embedding, operator, and associated linear controller within an iterative loop. By prioritizing the task cost as the main objective for controller learning, we reduce the reliance of controller design on a well-identified model, which, for the first time to the best of our knowledge, extends Koopman control from low to high-dimensional, complex nonlinear systems, including pixel-based tasks and a real robot with lidar observations. Code and videos are available: https://sites.google.com/view/kpmlilatsupp/.", "keywords": "Learning and control;Koopman-based control;Represention learning", "primary_area": "", "supplementary_material": "", "author": "Xubo Lyu;Hanyang Hu;Seth Siriya;Ye Pu;Mo Chen", "authorids": "~Xubo_Lyu1;~Hanyang_Hu1;~Seth_Siriya1;~Ye_Pu1;~Mo_Chen1", "gender": "M;M;;F;M", "homepage": "https://xubo92.github.io/;;;https://sites.google.com/site/yepuhomepage/;http://www.sfu.ca/~mochen/", "dblp": ";;;;", "google_scholar": "https://scholar.google.ca/citations?user=xXurDVUAAAAJ;https://scholar.google.ca/citations?user=d016pxMAAAAJ;8nQmRW0AAAAJ;https://scholar.google.com.au/citations?user=5IlwSs8AAAAJ;https://scholar.google.ca/citations?user=19UAgLUAAAAJ", "orcid": ";;;;0000-0001-8506-3665", "linkedin": ";;;;", "or_profile": "~Xubo_Lyu1;~Hanyang_Hu1;~Seth_Siriya1;~Ye_Pu1;~Mo_Chen1", "aff": "Simon Fraser University;Simon Fraser University;University of Melbourne;University of Melbourne;Simon Fraser University", "aff_domain": "sfu.ca;sfu.ca;unimelb.edu;unimelb.edu.au;sfu.ca", "position": "PhD student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nlyu2023taskoriented,\ntitle={Task-Oriented Koopman-Based Control with Contrastive Encoder},\nauthor={Xubo Lyu and Hanyang Hu and Seth Siriya and Ye Pu and Mo Chen},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=q0VAoefCI2}\n}", "github": "https://sites.google.com/view/kpmlilatsupp/", "project": "", "reviewers": "kvw1;n3f2;rfrp;XqmK", "site": "https://openreview.net/forum?id=q0VAoefCI2", "pdf_size": 0, "rating": "6;6;6;10", "confidence": "3;5;3;3", "rating_avg": 7.0, "confidence_avg": 3.5, "replies_avg": 14, "authors#_avg": 5, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13153013578825897572&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Simon Fraser University;University of Melbourne", "aff_unique_dep": ";", "aff_unique_url": "https://www.sfu.ca;https://www.unimelb.edu.au", "aff_unique_abbr": "SFU;UniMelb", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1;0", "aff_country_unique": "Canada;Australia" }, { "id": "qVc7NWYTRZ6", "title": "An Unbiased Look at Datasets for Visuo-Motor Pre-Training", "track": "main", "status": "Poster", "tldr": "Our data-centric analysis busts some common myths in visuo-motor pre-training! We find that old-school datasets (e.g. 
ImageNet) outcompete SOTA robotics baselines (trained on 5x more Ego4D data), and offer simple guidelines for improving evaluation.", "abstract": "Visual representation learning hold great promise for robotics, but is severely hampered by the scarcity and homogeneity of robotics datasets. Recent works address this problem by pre-training visual representations on large-scale but out-of-domain data (e.g., videos of egocentric interactions) and then transferring them to target robotics tasks. While the field is heavily focused on developing better pre-training algorithms, we find that dataset choice is just as important to this paradigm's success. After all, the representation can only learn the structures or priors present in the pre-training dataset. To this end, we flip the focus on algorithms, and instead conduct a dataset centric analysis of robotic pre-training. Our findings call into question some common wisdom in the field. We observe that traditional vision datasets (like ImageNet, Kinetics and 100 Days of Hands) are surprisingly competitive options for visuo-motor representation learning, and that the pre-training dataset's image distribution matters more than its size. Finally, we show that common simulation benchmarks are not a reliable proxy for real world performance and that simple regularization strategies can dramatically improve real world policy learning.", "keywords": "Visual Representation Learning;Datasets;Manipulation", "primary_area": "", "supplementary_material": "/attachment/4215901de79e64bbb283f5c28da5b7b85f054cae.zip", "author": "Sudeep Dasari;Mohan Kumar Srirama;Unnat Jain;Abhinav Gupta", "authorids": "~Sudeep_Dasari2;~Mohan_Kumar_Srirama1;~Unnat_Jain1;~Abhinav_Gupta1", "gender": ";M;;M", "homepage": ";https://www.mohansrirama.com;;http://www.cs.cmu.edu/~abhinavg", "dblp": ";;;36/7024-1", "google_scholar": ";;;https://scholar.google.com.tw/citations?user=bqL73OkAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Sudeep_Dasari2;~Mohan_Kumar_Srirama1;~Unnat_Jain1;~Abhinav_Gupta1", "aff": ";Carnegie Mellon University;;Carnegie Mellon University", "aff_domain": ";cmu.edu;;cmu.edu", "position": ";Researcher;;Full Professor", "bibtex": "@inproceedings{\ndasari2023an,\ntitle={An Unbiased Look at Datasets for Visuo-Motor Pre-Training},\nauthor={Sudeep Dasari and Mohan Kumar Srirama and Unnat Jain and Abhinav Gupta},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=qVc7NWYTRZ6}\n}", "github": "https://github.com/SudeepDasari/data4robotics", "project": "", "reviewers": "35RN;FvgU;SPNn;x3Nd", "site": "https://openreview.net/forum?id=qVc7NWYTRZ6", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "3;4;4;4", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 10, "authors#_avg": 4, "corr_rating_confidence": 1.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15252682809293010562&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "rOCWUmMBSnH", "title": "A Policy Optimization Method Towards Optimal-time Stability", "track": "main", "status": "Poster", "tldr": "", "abstract": "In current model-free reinforcement learning (RL) algorithms, stability criteria based on sampling 
methods are commonly utilized to guide policy optimization. However, these criteria only guarantee the infinite-time convergence of the system's state to an equilibrium point, which leads to sub-optimality of the policy. In this paper, we propose a policy optimization technique incorporating sampling-based Lyapunov stability. Our approach enables the system's state to reach an equilibrium point within an optimal time and maintain stability thereafter, referred to as \"\\textit{optimal-time stability}\". To achieve this, we integrate the optimization method into the Actor-Critic framework, resulting in the development of the Adaptive Lyapunov-based Actor-Critic (ALAC) algorithm. Through evaluations conducted on ten robotic tasks, our approach outperforms previous studies significantly, effectively guiding the system to generate stable patterns.", "keywords": "Reinforcement Learning;Robotic Control;Stability", "primary_area": "", "supplementary_material": "/attachment/f954081131f74a62f84a32ee86a8b097cd668c48.zip", "author": "Shengjie Wang;Lan Fengb;Xiang Zheng;Yuxue Cao;Oluwatosin OluwaPelumi Oseni;Haotian Xu;Tao Zhang;Yang Gao", "authorids": "~Shengjie_Wang2;~Lan_Fengb2;~Xiang_Zheng4;~Yuxue_Cao1;~Oluwatosin_OluwaPelumi_Oseni1;xuht1102@gmail.com;~Tao_Zhang9;~Yang_Gao1", "gender": "M;M;;;M;;;M", "homepage": "https://shengjiewang-jason.github.io/;https://github.com/lanrobot;;;https://tohsin.github.io/;;;http://yang-gao.weebly.com", "dblp": ";;;;;;;89/4402-29", "google_scholar": ";;;;;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;;;", "linkedin": ";;;;;;;yang-gao-45245348/", "or_profile": "~Shengjie_Wang2;~Lan_Fengb2;~Xiang_Zheng4;~Yuxue_Cao1;~Oluwatosin_OluwaPelumi_Oseni1;xuht1102@gmail.com;~Tao_Zhang9;~Yang_Gao1", "aff": "Tsinghua University;Tsinghua University;;;;;;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;;;;;;tsinghua.edu.cn", "position": "PhD student;PhD student;;;;;;Assistant Professor", "bibtex": "@inproceedings{\nwang2023a,\ntitle={A Policy Optimization Method Towards Optimal-time Stability},\nauthor={Shengjie Wang and Lan Fengb and Xiang Zheng and Yuxue Cao and Oluwatosin OluwaPelumi Oseni and Haotian Xu and Tao Zhang and Yang Gao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rOCWUmMBSnH}\n}", "github": "", "project": "", "reviewers": "bi85;fiUo;M8UP", "site": "https://openreview.net/forum?id=rOCWUmMBSnH", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;3", "rating_avg": 6.0, "confidence_avg": 3.0, "replies_avg": 12, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15424571955441011392&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "rPye6EZxmI", "title": "Reinforcement Learning Enables Real-Time Planning and Control of Agile Maneuvers for Soft Robot Arms", "track": "main", "status": "Poster", "tldr": "This is the first work that demonstrates real-time planning and control of agile maneuvers by soft robot arms, which is achieved by using reinforcement learning and key insights to overcome sim-to-real challenges for zero-shot sim-to-real transfer.", "abstract": "Control policies for soft 
robot arms typically assume quasi-static motion or require a hand-designed motion plan. To achieve real-time planning and control for tasks requiring highly dynamic maneuvers, we apply deep reinforcement learning to train a policy entirely in simulation, and we identify strategies and insights that bridge the gap between simulation and reality. In particular, we strengthen the policy\u2019s tolerance for inaccuracies with domain randomization and implement crucial simulator modifications that improve actuation and sensor modeling, enabling zero-shot sim-to-real transfer without requiring high-fidelity soft robot dynamics. We demonstrate the effectiveness of this approach with experiments on physical hardware and show that our soft robot can reach target positions that require dynamic swinging motions. This is the first work to achieve such agile maneuvers on a physical soft robot, advancing the field of soft robot arm planning and control. Our code and videos are publicly available at https://sites.google.com/view/rl-soft-robot.", "keywords": "Soft Robotics;Reinforcement Learning;Sim-to-Real Transfer;Dynamics and Control", "primary_area": "", "supplementary_material": "/attachment/4a28cc01902ebe9db97b396f2fffedf742551738.zip", "author": "Rianna Jitosho;Tyler Ga Wei Lum;Allison Okamura;Karen Liu", "authorids": "~Rianna_Jitosho1;~Tyler_Ga_Wei_Lum1;~Allison_Okamura1;~Karen_Liu1", "gender": ";M;F;", "homepage": "https://rjitosho.github.io/;https://tylerlum.github.io/;http://charm.stanford.edu;https://cs.stanford.edu/~karenliu", "dblp": ";;;", "google_scholar": ";kPq6-XIAAAAJ;lD4Yjn4AAAAJ;i28fU0MAAAAJ", "orcid": ";;0000-0002-6912-1666;0000-0001-5926-0905", "linkedin": ";tyler-lum/;allison-okamura/;", "or_profile": "~Rianna_Jitosho1;~Tyler_Ga_Wei_Lum1;~Allison_Okamura1;~Karen_Liu1", "aff": "Stanford University;Stanford University;;Computer Science Department, Stanford University", "aff_domain": "stanford.edu;stanford.edu;;cs.stanford.edu", "position": "PhD student;PhD student;;Full Professor", "bibtex": "@inproceedings{\njitosho2023reinforcement,\ntitle={Reinforcement Learning Enables Real-Time Planning and Control of Agile Maneuvers for Soft Robot Arms},\nauthor={Rianna Jitosho and Tyler Ga Wei Lum and Allison Okamura and Karen Liu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rPye6EZxmI}\n}", "github": "https://github.com/tylerlum/Vine_Robot_IsaacGymEnvs", "project": "", "reviewers": "y2wh;UnPS;sYng", "site": "https://openreview.net/forum?id=rPye6EZxmI", "pdf_size": 0, "rating": "6;6;10", "confidence": "3;4;4", "rating_avg": 7.333333333333333, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5315100802602746307&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "rYZBdBytxBx", "title": "HOI4ABOT: Human-Object Interaction Anticipation for Human Intention Reading Collaborative roBOTs", "track": "main", "status": "Poster", "tldr": "A deep learning framework for recognizing human intention through the anticipation of human-object interaction and its implementation in robot 
assistive tasks.", "abstract": "Robots are becoming increasingly integrated into our lives, assisting us in various tasks. To ensure effective collaboration between humans and robots, it is essential that they understand our intentions and anticipate our actions. In this paper, we propose a Human-Object Interaction (HOI) anticipation framework for collaborative robots. We propose an efficient and robust transformer-based model to detect and anticipate HOIs from videos. This enhanced anticipation empowers robots to proactively assist humans, resulting in more efficient and intuitive collaborations. Our model outperforms state-of-the-art results in HOI detection and anticipation in VidHOI dataset with an increase of 1.76% and 1.04% in mAP respectively while being 15.4 times faster. We showcase the effectiveness of our approach through experimental results in a real robot, demonstrating that the robot's ability to anticipate HOIs is key for better Human-Robot Interaction.", "keywords": "Human-Object Interaction;Human-Robot Collaboration;Human Intention", "primary_area": "", "supplementary_material": "/attachment/37cc679f6ae8e5db82e071e0cc64ee8665d114ea.zip", "author": "Esteve Valls Mascaro;Daniel Sliwowski;Dongheui Lee", "authorids": "~Esteve_Valls_Mascaro1;~Daniel_Sliwowski1;~Dongheui_Lee2", "gender": "M;M;F", "homepage": "https://sites.google.com/view/estevevallsmascaro;https://www.tuwien.at/en/etit/asl/team/daniel-sliwowski;https://www.tuwien.at/en/etit/ict/asl/team/dongheui-lee", "dblp": ";;96/5115", "google_scholar": "https://scholar.google.com/citations?hl=ca;;https://scholar.google.de/citations?hl=de", "orcid": ";;", "linkedin": "esteve-valls-mascaro/;;dongheui-lee-8791b856/", "or_profile": "~Esteve_Valls_Mascaro1;~Daniel_Sliwowski1;~Dongheui_Lee2", "aff": "Technische Universit\u00e4t Wien;Technische Universit\u00e4t Wien;Technische Universit\u00e4t Wien", "aff_domain": "tuwien.ac.at;tuwien.ac.at;tuwien.ac.at", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nmascaro2023hoiabot,\ntitle={{HOI}4{ABOT}: Human-Object Interaction Anticipation for Human Intention Reading Collaborative ro{BOT}s},\nauthor={Esteve Valls Mascaro and Daniel Sliwowski and Dongheui Lee},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rYZBdBytxBx}\n}", "github": "", "project": "", "reviewers": "pnXM;5Maz;AxBS;fhCH", "site": "https://openreview.net/forum?id=rYZBdBytxBx", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;4;4;3", "rating_avg": 5.5, "confidence_avg": 3.75, "replies_avg": 11, "authors#_avg": 3, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16160062106622521700&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Wien", "aff_unique_dep": "", "aff_unique_url": "https://www.tuwien.ac.at", "aff_unique_abbr": "TU Wien", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Austria" }, { "id": "rpWi4SYGXj", "title": "Grounding Complex Natural Language Commands for Temporal Tasks in Unseen Environments", "track": "main", "status": "Poster", "tldr": "a modular system that uses LLMs to ground natural language navigational commands for temporal tasks in novel household and city-scaled environments without retraining", "abstract": "Grounding navigational commands to linear temporal logic (LTL) leverages 
its unambiguous semantics for reasoning about long-horizon tasks and verifying the satisfaction of temporal constraints. Existing approaches require training data from the specific environment and landmarks that will be used in natural language to understand commands in those environments. We propose Lang2LTL, a modular system and a software package that leverages large language models (LLMs) to ground temporal navigational commands to LTL specifications in environments without prior language data. We comprehensively evaluate Lang2LTL for five well-defined generalization behaviors.\nLang2LTL demonstrates the state-of-the-art ability of a single model to ground navigational commands to diverse temporal specifications in 21 city-scaled environments. Finally, we demonstrate a physical robot using Lang2LTL can follow 52 semantically diverse navigational commands in two indoor environments.", "keywords": "language grounding;temporal reasoning;robot navigation;formal methods", "primary_area": "", "supplementary_material": "/attachment/a0d6fc3c805b2b5c732010b829c14e2442844475.zip", "author": "Jason Xinyu Liu;Ziyi Yang;Ifrah Idrees;Sam Liang;Benjamin Schornstein;Stefanie Tellex;Ankit Shah", "authorids": "~Jason_Xinyu_Liu1;~Ziyi_Yang3;~Ifrah_Idrees1;saml@princeton.edu;~Benjamin_Schornstein1;~Stefanie_Tellex1;~Ankit_Shah2", "gender": "M;;F;;M;F;M", "homepage": "https://jasonxyliu.github.io/;https://yzylmc.github.io;;;;https://h2r.cs.brown.edu/;http://www.ajshah.info", "dblp": ";;;;;50/3149;", "google_scholar": "https://scholar.google.com/citations?hl=en;sH5h3iAAAAAJ;;;;https://scholar.google.com.tw/citations?user=Pd8-ju0AAAAJ;KmJNnzIAAAAJ", "orcid": ";;;;;;", "linkedin": "https://linkedin.com/in/jason-liu-11406498/;;ifrah-idrees/;;https://www.linkedin.com/mwlite/in/ben-schornstein;;", "or_profile": "~Jason_Xinyu_Liu1;~Ziyi_Yang3;~Ifrah_Idrees1;saml@princeton.edu;~Benjamin_Schornstein1;~Stefanie_Tellex1;~Ankit_Shah2", "aff": "Brown University;Brown University;Brown University;;Brown University;, Brown University;Brown University", "aff_domain": "brown.edu;brown.edu;brown.edu;;brown.edu;cs.brown.edu;brown.edu", "position": "PhD student;PhD student;PhD student;;Undergrad student;Associate Professor;Postdoc", "bibtex": "@inproceedings{\nliu2023grounding,\ntitle={Grounding Complex Natural Language Commands for Temporal Tasks in Unseen Environments},\nauthor={Jason Xinyu Liu and Ziyi Yang and Ifrah Idrees and Sam Liang and Benjamin Schornstein and Stefanie Tellex and Ankit Shah},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rpWi4SYGXj}\n}", "github": "https://github.com/h2r/Lang2LTL", "project": "", "reviewers": "o4yc;y8dy;qzh3;UUsu", "site": "https://openreview.net/forum?id=rpWi4SYGXj", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "4;3;3;4", "rating_avg": 7.5, "confidence_avg": 3.5, "replies_avg": 13, "authors#_avg": 7, "corr_rating_confidence": -0.19245008972987526, "gs_citation": 46, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15737744330575752774&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Brown University", "aff_unique_dep": "", "aff_unique_url": "https://www.brown.edu", "aff_unique_abbr": "Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "rvh0vkwKUM", "title": "Predicting Routine Object Usage for Proactive Robot Assistance", "track": "main", 
"status": "Poster", "tldr": "We propose SLaTe-PRO, a model which can learn to anticipate user's needs from past observations and use them to provide proactive assistance, and an interactive clarification mechanism, which can further refine such predictions.", "abstract": "Proactivity in robot assistance refers to the robot's ability to anticipate user needs and perform assistive actions without explicit requests. This requires understanding user routines, predicting consistent activities, and actively seeking information to predict inconsistent behaviors. We propose SLaTe-PRO (Sequential Latent Temporal model for Predicting Routine Object usage), which improves upon prior state-of-the-art by combining object and user action information, and conditioning object usage predictions on past history. Additionally, we find some human behavior to be inherently stochastic and lacking in contextual cues that the robot can use for proactive assistance. To address such cases, we introduce an interactive query mechanism that can be used to ask queries about the user's intended activities and object use to improve prediction. We evaluate our approach on longitudinal data from three households, spanning 24 activity classes. SLaTe-PRO performance raises the F1 score metric to 0.57 without queries, and 0.60 with user queries, over a score of 0.43 from prior work. We additionally present a case study with a fully autonomous household robot.", "keywords": "Proactive Robot Assistance;User Routine Understanding;Interactive Clarification;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/1ca2e76cafa67c727c5dd47a9246f792c384c103.zip", "author": "Maithili Patel;Aswin Gururaj Prakash;Sonia Chernova", "authorids": "~Maithili_Patel1;~Aswin_Gururaj_Prakash1;~Sonia_Chernova2", "gender": "F;M;F", "homepage": "https://maithili.github.io;;https://www.cc.gatech.edu/~chernova/", "dblp": "334/4404;;27/1140", "google_scholar": ";;EYo_WkEAAAAJ", "orcid": "0000-0001-8730-9198;;0000-0001-6320-0825", "linkedin": "maithili/;aswin-gururaj-076a4414b/;", "or_profile": "~Maithili_Patel1;~Aswin_Gururaj_Prakash1;~Sonia_Chernova2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;MS student;Associate Professor", "bibtex": "@inproceedings{\npatel2023predicting,\ntitle={Predicting Routine Object Usage for Proactive Robot Assistance},\nauthor={Maithili Patel and Aswin Gururaj Prakash and Sonia Chernova},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rvh0vkwKUM}\n}", "github": "https://github.com/Maithili/SLaTe-PRO", "project": "", "reviewers": "ymJh;DVTZ;4vMT", "site": "https://openreview.net/forum?id=rvh0vkwKUM", "pdf_size": 0, "rating": "4;6;10", "confidence": "3;2;4", "rating_avg": 6.666666666666667, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.6546536707079772, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13652854370308140282&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "rxlokRzNWRq", "title": "ManiCast: 
Collaborative Manipulation with Cost-Aware Human Forecasting", "track": "main", "status": "Poster", "tldr": "ManiCast learns cost-aware human forecasts for collaborative manipulation tasks instead of simply predicting the most-likely estimate of future human motion.", "abstract": "Seamless human-robot manipulation in close proximity relies on accurate forecasts of human motion. While there has been significant progress in learning forecast models at scale, when applied to manipulation tasks, these models accrue high errors at critical transition points leading to degradation in downstream planning performance. Our key insight is that instead of predicting the most likely human motion, it is sufficient to produce forecasts that capture how future human motion would affect the cost of a robot's plan. We present ManiCast, a novel framework that learns cost-aware human forecasts and feeds them to a model predictive control planner to execute collaborative manipulation tasks. Our framework enables fluid, real-time interactions between a human and a 7-DoF robot arm across a number of real-world tasks such as reactive stirring, object handovers, and collaborative table setting. We evaluate both the motion forecasts and the end-to-end forecaster-planner system against a range of learned and heuristic baselines while additionally contributing new datasets. We release our code and datasets at https://portal-cornell.github.io/manicast/.", "keywords": "Collaborative Manipulation;Forecasting;Model Predictive Control", "primary_area": "", "supplementary_material": "/attachment/ed449d2d4c34814476af358b5c89ab82ad6dfcfa.zip", "author": "Kushal Kedia;Prithwish Dan;Atiksh Bhardwaj;Sanjiban Choudhury", "authorids": "~Kushal_Kedia1;~Prithwish_Dan1;~Atiksh_Bhardwaj1;~Sanjiban_Choudhury3", "gender": "M;M;M;M", "homepage": "https://kushal2000.github.io/;https://portfolio-pdan101.vercel.app/;;https://www.sanjibanchoudhury.com/", "dblp": ";;;135/8207", "google_scholar": ";;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";prithwish-dan/;atiksh-bhardwaj-b080ab241/;", "or_profile": "~Kushal_Kedia1;~Prithwish_Dan1;~Atiksh_Bhardwaj1;~Sanjiban_Choudhury3", "aff": "Cornell University;Department of Computer Science, Cornell University;Cornell University;Cornell University", "aff_domain": "cornell.edu;cs.cornell.edu;cornell.edu;cornell.edu", "position": "PhD student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nkedia2023manicast,\ntitle={ManiCast: Collaborative Manipulation with Cost-Aware Human Forecasting},\nauthor={Kushal Kedia and Prithwish Dan and Atiksh Bhardwaj and Sanjiban Choudhury},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=rxlokRzNWRq}\n}", "github": "", "project": "", "reviewers": "Z1jy;EFoV;M4Pj;1c4Z", "site": "https://openreview.net/forum?id=rxlokRzNWRq", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;3;3;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 14, "authors#_avg": 4, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13580258010437315378&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United 
States" }, { "id": "sLhk0keeiseH", "title": "That Sounds Right: Auditory Self-Supervision for Dynamic Robot Manipulation", "track": "main", "status": "Poster", "tldr": "Learning contact-rich, dynamic manipulation behaviors using self-supervised techniques in audio.", "abstract": "Learning to produce contact-rich, dynamic behaviors from raw sensory data has been a longstanding challenge in robotics. Prominent approaches primarily focus on using visual and tactile sensing. However, pure vision often fails to capture high-frequency interaction, while current tactile sensors can be too delicate for large-scale data collection. In this work, we propose a data-centric approach to dynamic manipulation that uses an often ignored source of information -- sound. We first collect a dataset of 25k interaction-sound pairs across five dynamic tasks using contact microphones. Then, given this data, we leverage self-supervised learning to accelerate behavior prediction from sound. Our experiments indicate that this self-supervised `pretraining' is crucial to achieving high performance, with a 34.5% lower MSE than plain supervised learning and a 54.3% lower MSE over visual training. Importantly, we find that when asked to generate desired sound profiles, online rollouts of our models on a UR10 robot can produce dynamic behavior that achieves an average of 11.5% improvement over supervised learning on audio similarity metrics. Videos and audio data are best seen on our project website: aurl-anon.github.io", "keywords": "Dynamic manipulation;Self supervised learning;Audio", "primary_area": "", "supplementary_material": "/attachment/35c17d0edf77926ff06c89a84e196872bd28ccfb.zip", "author": "Abitha Thankaraj;Lerrel Pinto", "authorids": "~Abitha_Thankaraj1;~Lerrel_Pinto1", "gender": ";M", "homepage": ";https://www.lerrelpinto.com/", "dblp": ";168/8304", "google_scholar": ";pmVPj94AAAAJ", "orcid": ";", "linkedin": ";", "or_profile": "~Abitha_Thankaraj1;~Lerrel_Pinto1", "aff": ";New York University", "aff_domain": ";cs.nyu.edu", "position": ";Assistant Professor", "bibtex": "@inproceedings{\nthankaraj2023that,\ntitle={That Sounds Right: Auditory Self-Supervision for Dynamic Robot Manipulation},\nauthor={Abitha Thankaraj and Lerrel Pinto},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=sLhk0keeiseH}\n}", "github": "https://github.com/abitha-thankaraj/audio-robot-learning", "project": "", "reviewers": "KfWt;KaUb;UsSe;N4ak", "site": "https://openreview.net/forum?id=sLhk0keeiseH", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;3;3;3", "rating_avg": 6.0, "confidence_avg": 3.25, "replies_avg": 11, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8087066861749487428&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "New York University", "aff_unique_dep": "", "aff_unique_url": "https://www.nyu.edu", "aff_unique_abbr": "NYU", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "uJqxFjF1xWp", "title": "BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities", "track": "main", "status": "Poster", "tldr": "", "abstract": "Collaborative perception enables agents to share complementary perceptual information with nearby agents. This can significantly benefit the perception performance and alleviate the issues of single-view perception, such as occlusion and sparsity. 
Most proposed approaches mainly focus on a single modality (especially LiDAR), and do not fully exploit the superiority of multi-modal perception. We propose a collaborative perception paradigm, BM2CP, which employs LiDAR and camera to achieve efficient multi-modal perception. BM2CP utilizes LiDAR-guided modal fusion, cooperative depth generation and modality-guided intermediate fusion to acquire deep interactions between modalities and agents. Moreover, it is capable of coping with the special case in which one of the sensors is unavailable. Extensive experiments validate that it outperforms the state-of-the-art methods with 50X lower communication volumes in real-world autonomous driving scenarios. Our code is available in the supplementary materials.", "keywords": "Multi-Agent Perception;Multi-Modal Fusion;Vehicle-to-Everything (V2X) Application", "primary_area": "", "supplementary_material": "/attachment/e1030a24ff06ad9b88e4c052db12285b062fbef2.zip", "author": "Binyu Zhao;Wei ZHANG;Zhaonian Zou", "authorids": "~Binyu_Zhao1;~Wei_ZHANG89;~Zhaonian_Zou1", "gender": "M;M;M", "homepage": "https://dblp.org/pid/279/0619;http://homepage.hit.edu.cn/zhangweics?lang=zh;https://homepage.hit.edu.cn/zou", "dblp": "279/0619;10/4661-17.html;50/6683", "google_scholar": "4OC5Hj0AAAAJ;;", "orcid": "0000-0002-9564-6757;;0000-0001-9475-8944", "linkedin": ";;", "or_profile": "~Binyu_Zhao1;~Wei_ZHANG89;~Zhaonian_Zou1", "aff": "Harbin Institute of Technology;Harbin Institute of Technology;Harbin Institute of Technology", "aff_domain": "hit.edu.cn;hit.edu.cn;hit.edu.cn", "position": "PhD student;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nzhao2023bmcp,\ntitle={{BM}2{CP}: Efficient Collaborative Perception with Li{DAR}-Camera Modalities},\nauthor={Binyu Zhao and Wei ZHANG and Zhaonian Zou},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=uJqxFjF1xWp}\n}", "github": "", "project": "", "reviewers": "oA8p;TJuY;UhGW;bD4g", "site": "https://openreview.net/forum?id=uJqxFjF1xWp", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;1;4", "rating_avg": 6.0, "confidence_avg": 2.75, "replies_avg": 15, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14524293114897469005&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Harbin Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "http://www.hit.edu.cn/", "aff_unique_abbr": "HIT", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Harbin", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "uo937r5eTE", "title": "Robot Parkour Learning", "track": "main", "status": "Oral", "tldr": "We present an end-to-end vision-based robot parkour learning system for low-cost robots", "abstract": "Parkour is a grand challenge for legged locomotion that requires robots to overcome various obstacles rapidly in complex environments. Existing methods can generate either diverse but blind locomotion skills or vision-based but specialized skills by using reference animal data or complex rewards. However, \textit{autonomous} parkour requires robots to learn generalizable skills that are both vision-based and diverse to perceive and react to various scenarios. In this work, we propose a system for learning a single end-to-end vision-based parkour policy of diverse parkour skills using a simple reward without any reference motion data.
We develop a reinforcement learning method inspired by direct collocation to generate parkour skills, including climbing over high obstacles, leaping over large gaps, crawling beneath low barriers, squeezing through thin slits, and running. We distill these skills into a single vision-based parkour policy and transfer it to a quadrupedal robot using its egocentric depth camera. We demonstrate that our system can empower low-cost quadrupedal robots to autonomously select and execute appropriate parkour skills to traverse challenging environments in the real world. Project website: https://robot-parkour.github.io/", "keywords": "Agile Locomotion;End-to-End Vision-Based Control;Sim-to-Real", "primary_area": "", "supplementary_material": "/attachment/758efe1e118349f40270ec17a08c3bbe21f84f07.zip", "author": "Ziwen Zhuang;Zipeng Fu;Jianren Wang;Christopher G Atkeson;S\u00f6ren Schwertfeger;Chelsea Finn;Hang Zhao", "authorids": "~Ziwen_Zhuang1;~Zipeng_Fu1;~Jianren_Wang2;~Christopher_G_Atkeson1;~S\u00f6ren_Schwertfeger1;~Chelsea_Finn1;~Hang_Zhao1", "gender": "M;M;M;;;F;M", "homepage": "https://ziwenzhuang.github.io;https://zipengfu.github.io;https://www.jianrenw.com/;;;https://ai.stanford.edu/~cbfinn/;http://www.mit.edu/~hangzhao/", "dblp": ";245/1504;34/8491;;;131/1783;", "google_scholar": "GE8fpdwAAAAJ;wMcPTbEAAAAJ;qR4O45oAAAAJ;;;vfPE6hgAAAAJ;DmahiOYAAAAJ", "orcid": ";;;;;;", "linkedin": "leozhuang;zipengfu;;;;;", "or_profile": "~Ziwen_Zhuang1;~Zipeng_Fu1;~Jianren_Wang2;~Christopher_G_Atkeson1;~S\u00f6ren_Schwertfeger1;~Chelsea_Finn1;~Hang_Zhao1", "aff": "ShanghaiTech University;Stanford University;Carnegie Mellon University;;;Google;Tsinghua University", "aff_domain": "shanghaitech.edu.cn;stanford.edu;cmu.edu;;;google.com;tsinghua.edu.cn", "position": "MS student;PhD student;PhD student;;;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nzhuang2023robot,\ntitle={Robot Parkour Learning},\nauthor={Ziwen Zhuang and Zipeng Fu and Jianren Wang and Christopher G Atkeson and S{\\\"o}ren Schwertfeger and Chelsea Finn and Hang Zhao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=uo937r5eTE}\n}", "github": "https://github.com/ZiwenZhuang/parkour", "project": "", "reviewers": "u4ib;pRuB;PpdX;rf5K", "site": "https://openreview.net/forum?id=uo937r5eTE", "pdf_size": 0, "rating": "6;10;10;10", "confidence": "4;5;5;5", "rating_avg": 9.0, "confidence_avg": 4.75, "replies_avg": 16, "authors#_avg": 7, "corr_rating_confidence": 1.0, "gs_citation": 195, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9357736131342774978&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;4", "aff_unique_norm": "ShanghaiTech University;Stanford University;Carnegie Mellon University;Google;Tsinghua University", "aff_unique_dep": ";;;Google;", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.stanford.edu;https://www.cmu.edu;https://www.google.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "ShanghaiTech;Stanford;CMU;Google;THU", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Stanford;Mountain View", "aff_country_unique_index": "0;1;1;1;0", "aff_country_unique": "China;United States" }, { "id": "veGdf4L4Xz", "title": "KITE: Keypoint-Conditioned Policies for Semantic Manipulation", "track": "main", "status": "Poster", "tldr": "We present a framework for semantic manipulation leveraging keypoint-based grounding and a library of skills for precise execution.", "abstract": "While natural 
language offers a convenient shared interface for humans and robots, enabling robots to interpret and follow language commands remains a longstanding challenge in manipulation. A crucial step to realizing a performant instruction-following robot is achieving semantic manipulation \u2013 where a robot interprets language at different specificities, from high-level instructions like \"Pick up the stuffed animal\" to more detailed inputs like \"Grab the left ear of the elephant.\" To tackle this, we propose Keypoints + Instructions to Execution, a two-step framework for semantic manipulation which attends to both scene semantics (distinguishing between different objects in a visual scene) and object semantics (precisely localizing different parts within an object instance). KITE first grounds an input instruction in a visual scene through 2D image keypoints, providing a highly accurate object-centric bias for downstream action inference. Provided an RGB-D scene observation, KITE then executes a learned keypoint-conditioned skill to carry out the instruction. The combined precision of keypoints and parameterized skills enables fine-grained manipulation with generalization to scene and object variations. Empirically, we demonstrate KITE in 3 real-world environments: long-horizon 6-DoF tabletop manipulation, semantic grasping, and a high-precision coffee-making task. In these settings, KITE achieves a 75%, 70%, and 71% overall success rate for instruction-following, respectively. KITE outperforms frameworks that opt for pre-trained visual language models over keypoint-based grounding, or omit skills in favor of end-to-end visuomotor control, all while being trained from fewer or comparable amounts of demonstrations. Supplementary material, datasets, code, and videos can be found on our website: https://tinyurl.com/kite-site.", "keywords": "Semantic Manipulation;Language Grounding;Keypoint Perception", "primary_area": "", "supplementary_material": "/attachment/74a4e6ac0bc120cbc8bfb2ad509ad3f2b7b77b9d.zip", "author": "Priya Sundaresan;Suneel Belkhale;Dorsa Sadigh;Jeannette Bohg", "authorids": "~Priya_Sundaresan1;~Suneel_Belkhale1;~Dorsa_Sadigh1;~Jeannette_Bohg1", "gender": "F;M;F;", "homepage": ";https://github.com/suneelbelkhale;https://dorsa.fyi/;https://web.stanford.edu/~bohg/", "dblp": ";236/5069;117/3174;52/7377", "google_scholar": "7SUquR4AAAAJ;;ZaJEZpYAAAAJ;rjnJnEkAAAAJ", "orcid": ";0000-0002-3963-7987;;0000-0002-4921-7193", "linkedin": ";suneel-b-032b1a101/;;", "or_profile": "~Priya_Sundaresan1;~Suneel_Belkhale1;~Dorsa_Sadigh1;~Jeannette_Bohg1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nsundaresan2023kite,\ntitle={{KITE}: Keypoint-Conditioned Policies for Semantic Manipulation},\nauthor={Priya Sundaresan and Suneel Belkhale and Dorsa Sadigh and Jeannette Bohg},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=veGdf4L4Xz}\n}", "github": "http://tinyurl.com/kite-site", "project": "", "reviewers": "C4rM;nGAo;pCg6", "site": "https://openreview.net/forum?id=veGdf4L4Xz", "pdf_size": 0, "rating": "6;6;6", "confidence": "4;4;3", "rating_avg": 6.0, "confidence_avg": 3.6666666666666665, "replies_avg": 13, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 25, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=6234604369087074746&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "vsEWu6mMUhB", "title": "Semantic Mechanical Search with Large Vision and Language Models", "track": "main", "status": "Poster", "tldr": "Creating semantic distributions for mechanical search from LLM features", "abstract": "Moving objects to find a fully-occluded target object, known as mechanical search, is a challenging problem in robotics. As objects are often organized semantically, we conjecture that semantic information about object relationships can facilitate mechanical search and reduce search time. Large pretrained vision and language models (VLMs and LLMs) have shown promise in generalizing to uncommon objects and previously unseen real-world environments. In this work, we propose a novel framework called Semantic Mechanical Search (SMS). SMS conducts scene understanding and generates a semantic occupancy distribution explicitly using LLMs. Compared to methods that rely on visual similarities offered by CLIP embeddings, SMS leverages the deep reasoning capabilities of LLMs. Unlike prior work that uses VLMs and LLMs as end-to-end planners, which may not integrate well with specialized geometric planners, SMS can serve as a plug-in semantic module for downstream manipulation or navigation policies. For mechanical search in closed-world settings such as shelves, we compare with a geometric-based planner and show that SMS improves mechanical search performance by 24% across the pharmacy, kitchen, and office domains in simulation and 47.1% in physical experiments. For open-world real environments, SMS can produce better semantic distributions compared to CLIP-based methods, with the potential to be integrated with downstream navigation policies to improve object navigation tasks. 
Code, data, videos, and Appendix are available here.", "keywords": "Vision and Language Models in Robotics;Mechanical Search;Object search", "primary_area": "", "supplementary_material": "/attachment/64edea873b29616d1bcfe881d394f7d26efc9475.zip", "author": "Satvik Sharma;Huang Huang;Kaushik Shivakumar;Lawrence Yunliang Chen;Ryan Hoque;brian ichter;Ken Goldberg", "authorids": "~Satvik_Sharma1;~Huang_Huang1;~Kaushik_Shivakumar1;~Lawrence_Yunliang_Chen1;~Ryan_Hoque1;~brian_ichter1;~Ken_Goldberg1", "gender": "M;;M;M;M;;M", "homepage": ";https://sites.google.com/site/huanghuang9729/home;https://kmindspark.github.io/;https://yunliangchen.github.io/;https://ryanhoque.github.io;;http://goldberg.berkeley.edu/", "dblp": ";;;;250/9457;;g/KennethYGoldberg", "google_scholar": "0wZN6hEAAAAJ;;2TTqpGQAAAAJ;;ywv6tDUAAAAJ;-w5DuHgAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ", "orcid": ";;;;;;0000-0001-6747-9499", "linkedin": ";;kaushik-shivakumar/;lawrence-yunliang-chen/;https://linkedin.com/in/ryanhoque;;goldbergken/", "or_profile": "~Satvik_Sharma1;~Huang_Huang1;~Kaushik_Shivakumar1;~Lawrence_Yunliang_Chen1;~Ryan_Hoque1;~brian_ichter1;~Ken_Goldberg1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Google;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;google.com;berkeley.edu", "position": "Undergrad student;PhD student;MS student;PhD student;PhD student;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nsharma2023semantic,\ntitle={Semantic Mechanical Search with Large Vision and Language Models},\nauthor={Satvik Sharma and Huang Huang and Kaushik Shivakumar and Lawrence Yunliang Chen and Ryan Hoque and brian ichter and Ken Goldberg},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=vsEWu6mMUhB}\n}", "github": "", "project": "", "reviewers": "jFnz;5M8F;kaAs;D6LF", "site": "https://openreview.net/forum?id=vsEWu6mMUhB", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;5;4;4", "rating_avg": 6.5, "confidence_avg": 4.25, "replies_avg": 21, "authors#_avg": 7, "corr_rating_confidence": -0.13245323570650439, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14852414564257053045&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;0;0;1;0", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "w5ONmpgnfG", "title": "One-Shot Imitation Learning: A Pose Estimation Perspective", "track": "main", "status": "Poster", "tldr": "We study one-shot imitation learning from the perspective of unseen object pose estimation, providing valuable insights as well as showing this framework's potential in real world robotics tasks.", "abstract": "In this paper, we study imitation learning under the challenging setting of: (1) only a single demonstration, (2) no further data collection, and (3) no prior task or object knowledge. 
We show how, with these constraints, imitation learning can be formulated as a combination of trajectory transfer and unseen object pose estimation. To explore this idea, we provide an in-depth study on how state-of-the-art unseen object pose estimators perform for one-shot imitation learning on ten real-world tasks, and we take a deep dive into the effects that camera calibration, pose estimation error, and spatial generalisation have on task success rates. For videos, please visit www.robot-learning.uk/pose-estimation-perspective.", "keywords": "One-Shot Imitation Learning;Unseen Object Pose Estimation;Robot Manipulation", "primary_area": "", "supplementary_material": "/attachment/f9d3b1c2d6001e48efa6eefc170d72a7a48657e2.zip", "author": "Pietro Vitiello;Kamil Dreczkowski;Edward Johns", "authorids": "~Pietro_Vitiello1;~Kamil_Dreczkowski1;~Edward_Johns1", "gender": "M;M;M", "homepage": "https://www.pietrovitiello.com/;;https://www.robot-learning.uk", "dblp": ";;68/9968", "google_scholar": ";5hrKfnwAAAAJ;https://scholar.google.co.uk/citations?user=sMIUkiQAAAAJ", "orcid": ";0000-0001-8278-6550;0000-0002-8914-8786", "linkedin": "pietro-vitiello-737167196/;kamil-dreczkowski/;https://uk.linkedin.com/in/edward-johns-1b24845a", "or_profile": "~Pietro_Vitiello1;~Kamil_Dreczkowski1;~Edward_Johns1", "aff": "Imperial College London;Imperial College London;Imperial College London", "aff_domain": "ic.ac.uk;ic.ac.uk;imperial.ac.uk", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nvitiello2023oneshot,\ntitle={One-Shot Imitation Learning: A Pose Estimation Perspective},\nauthor={Pietro Vitiello and Kamil Dreczkowski and Edward Johns},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=w5ONmpgnfG}\n}", "github": "", "project": "", "reviewers": "DHE8;NRUS;Sjn1;cUKd", "site": "https://openreview.net/forum?id=w5ONmpgnfG", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "3;3;5;4", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 17, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16602012030898784502&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "wH23nZpVTF6", "title": "DEFT: Dexterous Fine-Tuning for Hand Policies", "track": "main", "status": "Poster", "tldr": "We present DEFT, a novel approach to learn complex dexterous manipulation tasks, leveraging human priors and fine-tuning via practice in the real world.", "abstract": "Dexterity is often seen as a cornerstone of complex manipulation. Humans are able to perform a host of skills with their hands, from making food to operating tools. In this paper, we investigate these challenges, especially in the case of soft, deformable objects as well as complex, relatively long-horizon tasks. However, learning such behaviors from scratch can be data-inefficient. To circumvent this, we propose a novel approach, DEFT (DExterous Fine-Tuning for Hand Policies), that leverages human-driven priors, which are executed directly in the real world. In order to improve upon these priors, DEFT involves an efficient online optimization procedure.
With the integration of human-based learning and online fine-tuning, coupled with a soft robotic hand, DEFT demonstrates success across various tasks, establishing a robust, data-efficient pathway toward general dexterous manipulation. Please see our website at https://dexterousfinetuning.github.io for video results.", "keywords": "Dexterous Manipulation;Reinforcement Learning;Learning from Videos", "primary_area": "", "supplementary_material": "/attachment/6442ba6c6e1c2d3f6aaff9a8fbfca7e217b26d95.zip", "author": "Aditya Kannan;Kenneth Shaw;Shikhar Bahl;Pragna Mannam;Deepak Pathak", "authorids": "~Aditya_Kannan1;~Kenneth_Shaw1;~Shikhar_Bahl1;~Pragna_Mannam1;~Deepak_Pathak1", "gender": ";M;;F;M", "homepage": "https://adityak77.github.io;https://www.linkedin.com/in/kenny-shaw/;https://www.cs.cmu.edu/~sbahl2/;https://www.ri.cmu.edu/ri-people/pragna-mannam/;https://www.cs.cmu.edu/~dpathak/", "dblp": ";;223/4390;;155/9860", "google_scholar": ";;bdHgGgEAAAAJ;1XZy004AAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;;0000-0001-6428-6050;", "linkedin": ";kenny-shaw/;;pmannam;pathak22/", "or_profile": "~Aditya_Kannan1;~Kenneth_Shaw1;~Shikhar_Bahl1;~Pragna_Mannam1;~Deepak_Pathak1", "aff": "School of Computer Science, Carnegie Mellon University;Carnegie Mellon University;Meta Facebook;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;cmu.edu;meta.com;cmu.edu;cmu.edu", "position": "MS student;MS student;Researcher;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkannan2023deft,\ntitle={{DEFT}: Dexterous Fine-Tuning for Hand Policies},\nauthor={Aditya Kannan and Kenneth Shaw and Shikhar Bahl and Pragna Mannam and Deepak Pathak},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=wH23nZpVTF6}\n}", "github": "", "project": "", "reviewers": "Tfey;g2GV;1dz2", "site": "https://openreview.net/forum?id=wH23nZpVTF6", "pdf_size": 0, "rating": "6;6;6", "confidence": "3;3;4", "rating_avg": 6.0, "confidence_avg": 3.3333333333333335, "replies_avg": 12, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": -1, "gs_cited_by_link": "", "gs_version_total": -1, "aff_unique_index": "0;0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Meta", "aff_unique_dep": "School of Computer Science;Meta Platforms, Inc.", "aff_unique_url": "https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "CMU;Meta", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "wMpOMO0Ss7a", "title": "SayPlan: Grounding Large Language Models using 3D Scene Graphs for Scalable Robot Task Planning", "track": "main", "status": "Oral", "tldr": "Combining large language models and 3D Scene Graphs for scalable robotic task planning in expansive multi-floor and multi-room environments, using semantic search and iterative re-planning.", "abstract": "Large language models (LLMs) have demonstrated impressive results in developing generalist planning agents for diverse tasks. However, grounding these plans in expansive, multi-floor, and multi-room environments presents a significant challenge for robotics. We introduce SayPlan, a scalable approach to LLM-based, large-scale task planning for robotics using 3D scene graph (3DSG) representations. 
To ensure the scalability of our approach, we: (1) exploit the hierarchical nature of 3DSGs to allow LLMs to conduct a \"semantic search\" for task-relevant subgraphs from a smaller, collapsed representation of the full graph; (2) reduce the planning horizon for the LLM by integrating a classical path planner; and (3) introduce an \"iterative replanning\" pipeline that refines the initial plan using feedback from a scene graph simulator, correcting infeasible actions and avoiding planning failures. We evaluate our approach on two large-scale environments spanning up to 3 floors and 36 rooms with 140 assets and objects and show that our approach is capable of grounding large-scale, long-horizon task plans from abstract, natural language instructions for a mobile manipulator robot to execute. We provide real robot video demonstrations on our project page https://sayplan.github.io.", "keywords": "robot task planning;large language models;semantic search;LLM-based planning;3D scene graphs", "primary_area": "", "supplementary_material": "/attachment/899a89a8e3b5eb4cc26e9e22b63943c1e26ec7d0.zip", "author": "Krishan Rana;Jesse Haviland;Sourav Garg;Jad Abou-Chakra;Ian Reid;Niko Suenderhauf", "authorids": "~Krishan_Rana1;~Jesse_Haviland1;~Sourav_Garg1;~Jad_Abou-Chakra1;~Ian_Reid1;~Niko_Suenderhauf1", "gender": "M;;M;M;M;M", "homepage": "https://krishanrana.github.io/;;https://oravus.github.io/;;;http://nikosuenderhauf.info", "dblp": "70/4142;;142/0073;;r/IanDReid1;", "google_scholar": "-hYjPxsAAAAJ;;oVS3HHIAAAAJ;;https://scholar.google.com.au/citations?user=ATkNLcQAAAAJ;https://scholar.google.com.au/citations?user=WnKjfFEAAAAJ", "orcid": "0000-0002-9028-9295;0000-0002-1227-7459;0000-0001-6068-3307;0000-0002-9122-3132;0000-0001-7790-6423;", "linkedin": "krishanrana/;;gargsourav/;;;nikosuenderhauf/", "or_profile": "~Krishan_Rana1;~Jesse_Haviland1;~Sourav_Garg1;~Jad_Abou-Chakra1;~Ian_Reid1;~Niko_Suenderhauf1", "aff": "Queensland University of Technology;Queensland University of Technology;Queensland University of Technology;Queensland University of Technology;University of Adelaide;Queensland University of Technology", "aff_domain": "qut.edu.au;qut.edu.au;qut.edu.au;qut.edu.au;adelaide.edu.au;qut.edu.au", "position": "Postdoc;Postdoc;Postdoc;PhD student;Professor;Full Professor", "bibtex": "@inproceedings{\nrana2023sayplan,\ntitle={SayPlan: Grounding Large Language Models using 3D Scene Graphs for Scalable Robot Task Planning},\nauthor={Krishan Rana and Jesse Haviland and Sourav Garg and Jad Abou-Chakra and Ian Reid and Niko Suenderhauf},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=wMpOMO0Ss7a}\n}", "github": "", "project": "", "reviewers": "QfzX;xLeS;ZPG6;UUK5", "site": "https://openreview.net/forum?id=wMpOMO0Ss7a", "pdf_size": 0, "rating": "10;10;10;10", "confidence": "4;4;5;4", "rating_avg": 10.0, "confidence_avg": 4.25, "replies_avg": 8, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 315, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3875993055660234789&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Queensland University of Technology;University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.qut.edu.au;https://www.adelaide.edu.au", "aff_unique_abbr": "QUT;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Australia" }, { "id": 
"xJ7XL5Wt8iN", "title": "CLUE: Calibrated Latent Guidance for Offline Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Offline reinforcement learning (RL) aims to learn an optimal policy from pre-collected and labeled datasets, which eliminates the time-consuming data collection in online RL. However, offline RL still bears a large burden of specifying/handcrafting extrinsic rewards for each transition in the offline data. As a remedy for the labor-intensive labeling, we propose to endow offline RL tasks with a few expert data and utilize the limited expert data to drive intrinsic rewards, thus eliminating the need for extrinsic rewards. To achieve that, we introduce Calibrated Latent gUidancE (CLUE), which utilizes a conditional variational auto-encoder to learn a latent space such that intrinsic rewards can be directly qualified over the latent space. CLUE's key idea is to align the intrinsic rewards consistent with the expert intention via enforcing the embeddings of expert data to a calibrated contextual representation. We instantiate the expert-driven intrinsic rewards in sparse-reward offline RL tasks, offline imitation learning (IL) tasks, and unsupervised offline RL tasks. Empirically, we find that CLUE can effectively improve the sparse-reward offline RL performance, outperform the state-of-the-art offline IL baselines, and discover diverse skills from static reward-free offline data.", "keywords": "Offline Reinforcement Learning;Intrinsic Rewards;Learning Skills", "primary_area": "", "supplementary_material": "/attachment/a5248fed5bddc3b524d2ea69e219b9b97ae15bc3.zip", "author": "Jinxin Liu;Lipeng Zu;Li He;Donglin Wang", "authorids": "~Jinxin_Liu1;~Lipeng_Zu1;~Li_He3;~Donglin_Wang1", "gender": ";;;M", "homepage": ";;;https://milab.westlake.edu.cn/", "dblp": ";;;", "google_scholar": ";;MKMKMrIAAAAJ;https://scholar.google.ca/citations?user=-fo6wdwAAAAJ", "orcid": ";;;0000-0002-8188-3735", "linkedin": ";;https://www.linkedin.cn/incareer/in/%E7%AB%8B-%E4%BD%95-94a95823a;", "or_profile": "~Jinxin_Liu1;~Lipeng_Zu1;~Li_He3;~Donglin_Wang1", "aff": ";;Westlake University;Westlake University", "aff_domain": ";;westlake.edu.cn;westlake.edu.cn", "position": ";;Research Assistant;Associate Professor", "bibtex": "@inproceedings{\nliu2023clue,\ntitle={{CLUE}: Calibrated Latent Guidance for Offline Reinforcement Learning},\nauthor={Jinxin Liu and Lipeng Zu and Li He and Donglin Wang},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=xJ7XL5Wt8iN}\n}", "github": "", "project": "", "reviewers": "GEyR;ashV;pVh3;CYF2", "site": "https://openreview.net/forum?id=xJ7XL5Wt8iN", "pdf_size": 0, "rating": "4;6;10;10", "confidence": "5;3;4;2", "rating_avg": 7.5, "confidence_avg": 3.5, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.6024640760767093, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4061857923124624860&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0", "aff_unique_norm": "Westlake University", "aff_unique_dep": "", "aff_unique_url": "https://www.westlake.edu.cn", "aff_unique_abbr": "WU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "xQx1O7WXSA", "title": "Expansive Latent Planning for Sparse Reward Offline Reinforcement Learning", "track": "main", "status": "Oral", "tldr": "", "abstract": "Sampling-based motion planning algorithms excel at 
searching global solution paths in geometrically complex settings. However, classical approaches, such as RRT, are difficult to scale beyond low-dimensional search spaces and rely on privileged knowledge e.g. about collision detection and underlying state distances. In this work, we take a step towards the integration of sampling-based planning into the reinforcement learning framework to solve sparse-reward control tasks from high-dimensional inputs. Our method, called VELAP, determines sequences of waypoints through sampling-based exploration in a learned state embedding. Unlike other sampling-based techniques, we iteratively expand a tree-based memory of visited latent areas, which is leveraged to explore a larger portion of the latent space for a given number of search iterations. We demonstrate state-of-the-art results in learning control from offline data in the context of vision-based manipulation under sparse reward feedback. Our method extends the set of available planning tools in model-based reinforcement learning by adding a latent planner that searches globally for feasible paths instead of being bound to a fixed prediction horizon.", "keywords": "model-based reinforcement learning;planning;robot manipulation", "primary_area": "", "supplementary_material": "/attachment/522307a7bc902f252aa0b6b995c7a83d0ef1d745.zip", "author": "Robert Gieselmann;Florian T. Pokorny", "authorids": "~Robert_Gieselmann1;~Florian_T._Pokorny1", "gender": ";", "homepage": "https://krobg.github.io/;", "dblp": "231/5269;", "google_scholar": "i-LuXQkAAAAJ;", "orcid": ";", "linkedin": ";", "or_profile": "~Robert_Gieselmann1;~Florian_T._Pokorny1", "aff": "KTH Royal Institute of Technology, Stockholm, Sweden;", "aff_domain": "kth.se;", "position": "PhD student;", "bibtex": "@inproceedings{\ngieselmann2023expansive,\ntitle={Expansive Latent Planning for Sparse Reward Offline Reinforcement Learning},\nauthor={Robert Gieselmann and Florian T. Pokorny},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=xQx1O7WXSA}\n}", "github": "", "project": "", "reviewers": "xuPo;6i5h;foWD;dVQH", "site": "https://openreview.net/forum?id=xQx1O7WXSA", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;4;4;4", "rating_avg": 8.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11307084644391645723&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "KTH Royal Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.kth.se", "aff_unique_abbr": "KTH", "aff_campus_unique_index": "0", "aff_campus_unique": "Stockholm", "aff_country_unique_index": "0", "aff_country_unique": "Sweden" }, { "id": "xgrZkRHliXR", "title": "Learning to Design and Use Tools for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "We propose a framework for robots to design tools and perform manipulation with them based on the task at hand.", "abstract": "When limited by their own morphologies, humans and some species of animals have the remarkable ability to use objects from the environment toward accomplishing otherwise impossible tasks. Robots might similarly unlock a range of additional capabilities through tool use. Recent techniques for jointly optimizing morphology and control via deep learning are effective at designing locomotion agents. 
But while outputting a single morphology makes sense for locomotion, manipulation involves a variety of strategies depending on the task goals at hand. A manipulation agent must be capable of rapidly prototyping specialized tools for different goals. Therefore, we propose learning a designer policy, rather than a single design. A designer policy is conditioned on task information and outputs a tool design that helps solve the task. A design-conditioned controller policy can then perform manipulation using these tools. In this work, we take a step towards this goal by introducing a reinforcement learning framework for jointly learning these policies. Through simulated manipulation tasks, we show that this framework is more sample efficient than prior methods in multi-goal or multi-variant settings, can perform zero-shot interpolation or fine-tuning to tackle previously unseen goals, and allows tradeoffs between the complexity of design and control policies under practical constraints. Finally, we deploy our learned policies onto a real robot. Please see our supplementary video and website at https://robotic-tool-design.github.io/ for visualizations.", "keywords": "tool use;manipulation;design", "primary_area": "", "supplementary_material": "/attachment/64e31a684420496287c663a1151ab1a696b617b7.zip", "author": "Ziang Liu;Stephen Tian;Michelle Guo;Karen Liu;Jiajun Wu", "authorids": "~Ziang_Liu2;~Stephen_Tian1;~Michelle_Guo1;~Karen_Liu1;~Jiajun_Wu1", "gender": ";M;F;;M", "homepage": ";http://s-tian.github.io;https://shellguo.com;https://cs.stanford.edu/~karenliu;https://jiajunwu.com", "dblp": ";237/9780;185/0671;;117/4768", "google_scholar": ";l19pn2sAAAAJ;lyjjpNMAAAAJ;i28fU0MAAAAJ;2efgcS0AAAAJ", "orcid": ";;0000-0002-6574-6669;0000-0001-5926-0905;0000-0002-4176-343X", "linkedin": ";;;;jiajunwu/", "or_profile": "~Ziang_Liu2;~Stephen_Tian1;~Michelle_Guo1;~Karen_Liu1;~Jiajun_Wu1", "aff": ";Stanford University;Computer Science Department, Stanford University;Computer Science Department, Stanford University;Stanford University", "aff_domain": ";stanford.edu;cs.stanford.edu;cs.stanford.edu;stanford.edu", "position": ";PhD student;PhD student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nliu2023learning,\ntitle={Learning to Design and Use Tools for Robotic Manipulation},\nauthor={Ziang Liu and Stephen Tian and Michelle Guo and Karen Liu and Jiajun Wu},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=xgrZkRHliXR}\n}", "github": "", "project": "", "reviewers": "D5az;2UAy;RS5q;1xsZ", "site": "https://openreview.net/forum?id=xgrZkRHliXR", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "5;5;4;5", "rating_avg": 6.5, "confidence_avg": 4.75, "replies_avg": 18, "authors#_avg": 5, "corr_rating_confidence": 0.13245323570650439, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9970939950849948767&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "yGkqN4hqrJ", "title": "Fine-Tuning Generative Models as an Inference Method for Robotic Tasks", "track": "main", "status": "Poster", "tldr": "A simple and general method for quickly fine-tuning generative models to novel 
observations in robotic tasks.", "abstract": "Adaptable models could greatly benefit robotic agents operating in the real world, allowing them to deal with novel and varying conditions. While approaches such as Bayesian inference are well-studied frameworks for adapting models to evidence, we build on recent advances in deep generative models which have greatly affected many areas of robotics. Harnessing modern GPU acceleration, we investigate how to quickly adapt the sample generation of neural network models to observations in robotic tasks. We propose a simple and general method that is applicable to various deep generative models and robotic environments. The key idea is to quickly fine-tune the model by fitting it to generated samples matching the observed evidence, using the cross-entropy method. We show that our method can be applied to both autoregressive models and variational autoencoders, and demonstrate its usability in object shape inference from grasping, inverse kinematics calculation, and point cloud completion.", "keywords": "robotic learning;fine-tuning;generative models", "primary_area": "", "supplementary_material": "/attachment/ea693bff805c18d8f11891b4b2216d243e875cd2.zip", "author": "Orr Krupnik;Elisei Shafer;Tom Jurgenson;Aviv Tamar", "authorids": "~Orr_Krupnik1;~Elisei_Shafer1;~Tom_Jurgenson1;~Aviv_Tamar2", "gender": "M;;;M", "homepage": "https://www.orrkrup.com;;;https://avivt.github.io/avivt/", "dblp": "234/8503;;https://dblp.uni-trier.de/pers/hd/j/Jurgenson:Tom;49/10622", "google_scholar": "E2lnAzIAAAAJ;;1YjIvioAAAAJ;https://scholar.google.co.il/citations?user=kppa2vgAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Orr_Krupnik1;~Elisei_Shafer1;~Tom_Jurgenson1;~Aviv_Tamar2", "aff": "Technion - Israel Institute of Technology, Technion - Israel Institute of Technology;;Technion;Technion, Technion", "aff_domain": "campus.technion.ac.il;;technion.ac.il;technion.ac.il", "position": "PhD student;;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nkrupnik2023finetuning,\ntitle={Fine-Tuning Generative Models as an Inference Method for Robotic Tasks},\nauthor={Orr Krupnik and Elisei Shafer and Tom Jurgenson and Aviv Tamar},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=yGkqN4hqrJ}\n}", "github": "https://github.com/orrkrup/mace/", "project": "", "reviewers": "KQ64;LQd2;EPGP;KLCv", "site": "https://openreview.net/forum?id=yGkqN4hqrJ", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;3;3;4", "rating_avg": 5.5, "confidence_avg": 3.5, "replies_avg": 7, "authors#_avg": 4, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16032895423879284669&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technion - Israel Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.technion.ac.il/en/", "aff_unique_abbr": "Technion", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Israel" }, { "id": "yHlUVHWnBN", "title": "SCONE: A Food Scooping Robot Learning Framework with Active Perception", "track": "main", "status": "Poster", "tldr": "A food scooping robot learning framework with active perception to improve food property-wise generalization.", "abstract": "Effectively scooping food items poses a substantial challenge for current robotic systems, due to the intricate states and diverse 
physical properties of food. To address this challenge, we believe in the importance of encoding food items into meaningful representations for effective food scooping. However, the distinctive properties of food items, including deformability, fragility, fluidity, or granularity, pose significant challenges for existing representations. In this paper, we investigate the potential of active perception for learning meaningful food representations in an implicit manner. To this end, we present SCONE, a food-scooping robot learning framework that leverages representations gained from active perception to facilitate food scooping policy learning. SCONE comprises two crucial encoding components: the interactive encoder and the state retrieval module. Through the encoding process, SCONE is capable of capturing properties of food items and vital state characteristics. In our real-world scooping experiments, SCONE excels with a 71% success rate when tasked with 6 previously unseen food items across three different difficulty levels, surpassing state-of-the-art methods. This enhanced performance underscores SCONE\u2019s stability, as all food items consistently achieve task success rates exceeding 50%. Additionally, SCONE\u2019s impressive capacity to accommodate diverse initial states enables it to precisely evaluate the present condition of the food, resulting in a compelling scooping success rate. For further information, please visit our website: https://sites.google.com/view/corlscone/home.", "keywords": "Food Manipulation;Robot Scooping;Active Perception", "primary_area": "", "supplementary_material": "/attachment/0265ae2659c133f970fd79898533e702820adbc4.zip", "author": "Yen-Ling Tai;Yu Chien Chiu;Yu-Wei Chao;Yi-Ting Chen", "authorids": "~Yen-Ling_Tai1;~Yu_Chien_Chiu1;~Yu-Wei_Chao1;~Yi-Ting_Chen2", "gender": "F;F;M;M", "homepage": "https://github.com/YLingT;https://github.com/Scoopyhead;http://www-personal.umich.edu/~ywchao/;https://sites.google.com/site/yitingchen0524/", "dblp": "287/8648;;44/10700;12/5268-1", "google_scholar": ";;48Y9F-YAAAAJ;8tRH7RMAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Yen-Ling_Tai1;~Yu_Chien_Chiu1;~Yu-Wei_Chao1;~Yi-Ting_Chen2", "aff": "National Yang Ming Chiao Tung University;National Yang Ming Chiao Tung University;NVIDIA;National Yang Ming Chiao Tung University", "aff_domain": "cs.nycu.edu.tw;nycu.edu.tw;nvidia.com;nycu.edu.tw", "position": "MS student;Undergrad student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\ntai2023scone,\ntitle={{SCONE}: A Food Scooping Robot Learning Framework with Active Perception},\nauthor={Yen-Ling Tai and Yu Chien Chiu and Yu-Wei Chao and Yi-Ting Chen},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=yHlUVHWnBN}\n}", "github": "", "project": "", "reviewers": "vCX4;S1uK;VGVQ;BzGd", "site": "https://openreview.net/forum?id=yHlUVHWnBN", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;5;4;5", "rating_avg": 8.0, "confidence_avg": 4.5, "replies_avg": 20, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11188950082634820129&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "National Yang Ming Chiao Tung University;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.nycu.edu.tw;https://www.nvidia.com", "aff_unique_abbr": "NYCU;NVIDIA", "aff_campus_unique_index": "0;0;0", 
"aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "ycy47ZX0Oc", "title": "Leveraging 3D Reconstruction for Mechanical Search on Cluttered Shelves", "track": "main", "status": "Poster", "tldr": "We propose a 3D reconstruction-based meachnical search algorithm on cluttered shelves.", "abstract": "Finding and grasping a target object on a cluttered shelf, especially when the target is occluded by other unknown objects and initially invisible, remains a significant challenge in robotic manipulation. While there have been advances in finding the target object by rearranging surrounding objects using specialized tools, developing algorithms that work with standard robot grippers remains an unresolved issue. In this paper, we introduce a novel framework for finding and grasping the target object using a standard gripper, employing pushing and pick and-place actions. To achieve this, we introduce two indicator functions: (i) an existence function, determining the potential presence of the target, and (ii) a graspability function, assessing the feasibility of grasping the identified target. We then formulate a model-based optimal control problem. The core component of our approach involves leveraging a 3D recognition model, enabling efficient estimation of the proposed indicator functions and their associated dynamics models. Our method succeeds in finding and grasping the target object using a standard robot gripper in both simulations and real-world settings. In particular, we demonstrate the adaptability and robustness of our method in the presence of noise in real-world vision sensor data. The code for our framework is available at https://github.com/seungyeon-k/Search-for-Grasp-public.", "keywords": "Mechanical search;Object rearrangement;Prehensile and Non-prehensile manipulation", "primary_area": "", "supplementary_material": "/attachment/b7ba55573106baa924a1742ebd7e38075b54a670.zip", "author": "Seungyeon Kim;Young Hun Kim;Yonghyeon Lee;Frank C. Park", "authorids": "~Seungyeon_Kim2;~Young_Hun_Kim1;~Yonghyeon_Lee2;~Frank_C._Park1", "gender": "M;M;M;M", "homepage": "https://seungyeon-k.github.io/;https://github.com/yhun96;https://www.gabe-yhlee.com;http://robotics.snu.ac.kr", "dblp": "74/7997-3;;182/6796;p/FrankChongwooPark", "google_scholar": "https://scholar.google.com/citations?hl=en;;;u-h3PJIAAAAJ", "orcid": "0000-0001-6708-5684;;;0000-0002-0293-6975", "linkedin": "seungyeon-kim-45a20b263/;;;", "or_profile": "~Seungyeon_Kim2;~Young_Hun_Kim1;~Yonghyeon_Lee2;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;Seoul National University;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;snu.ac.kr", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nkim2023leveraging,\ntitle={Leveraging 3D Reconstruction for Mechanical Search on Cluttered Shelves},\nauthor={Seungyeon Kim and Young Hun Kim and Yonghyeon Lee and Frank C. 
Park},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=ycy47ZX0Oc}\n}", "github": "https://github.com/seungyeon-k/Search-for-Grasp-public", "project": "", "reviewers": "e7zE;bz1q;kape;QSGc", "site": "https://openreview.net/forum?id=ycy47ZX0Oc", "pdf_size": 0, "rating": "4;6;6;10", "confidence": "4;4;4;3", "rating_avg": 6.5, "confidence_avg": 3.75, "replies_avg": 21, "authors#_avg": 4, "corr_rating_confidence": -0.9271726499455306, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6355564076313154634&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Seoul National University", "aff_unique_dep": "", "aff_unique_url": "https://www.snu.ac.kr", "aff_unique_abbr": "SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "yobahDU4HPP", "title": "Learning Realistic Traffic Agents in Closed-loop", "track": "main", "status": "Poster", "tldr": "We learn realistic traffic simulation agents in closed-loop using a joint IL + RL approach that exploits nominal offline-collected data and simulated long-tail scenarios.", "abstract": "Realistic traffic simulation is crucial for developing self-driving software in a safe and scalable manner prior to real-world deployment. Typically, imitation learning (IL) is used to learn human-like traffic agents directly from real-world observations collected offline, but without explicit specification of traffic rules, agents trained from IL alone frequently display unrealistic infractions like collisions and driving off the road. This problem is exacerbated in out-of-distribution and long-tail scenarios. On the other hand, reinforcement learning (RL) can train traffic agents to avoid infractions, but using RL alone results in unhuman-like driving behaviors. We propose Reinforcing Traffic Rules (RTR), a holistic closed-loop learning objective to match expert demonstrations under a traffic compliance constraint, which naturally gives rise to a joint IL + RL approach, obtaining the best of both worlds. Our method learns in closed-loop simulations of both nominal scenarios from real-world datasets as well as procedurally generated long-tail scenarios. Our experiments show that RTR learns more realistic and generalizable traffic simulation policies, achieving significantly better tradeoffs between human-like driving and traffic compliance in both nominal and long-tail scenarios. 
Moreover, when used as a data generation tool for training prediction models, our learned traffic policy leads to considerably improved downstream prediction metrics compared to baseline traffic agents.", "keywords": "Traffic simulation;Imitation learning;Reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/81ff5d0f9e50dc6c395699c3ced04fdd2adaa24a.zip", "author": "Chris Zhang;James Tu;Lunjun Zhang;Kelvin Wong;Simon Suo;Raquel Urtasun", "authorids": "~Chris_Zhang2;~James_Tu1;~Lunjun_Zhang1;~Kelvin_Wong1;~Simon_Suo2;~Raquel_Urtasun1", "gender": "M;;M;F;M;M", "homepage": ";https://lunjunzhang.github.io/;https://www.cs.toronto.edu/~kelvinwong/;http://www.cs.toronto.edu/~urtasun/;;http://simonsuo.com", "dblp": ";274/6535;195/5219;u/RaquelUrtasun;52/6610-1;174/4170", "google_scholar": "https://scholar.google.ca/citations?user=x6gPeg4AAAAJ;OqD5GcgAAAAJ;K4EqMuAAAAAJ;https://scholar.google.ca/citations?user=jyxO2akAAAAJ;d0BhFY0AAAAJ;vq10uQMAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;sdsuo/", "or_profile": "~James_Tu1;~Lunjun_Zhang1;~Kelvin_Wong1;~Raquel_Urtasun1;~Chris_Jia_Han_Zhang1;~Simon_Suo1", "aff": "Department of Computer Science, University of Toronto;Waabi Innovation;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto;Department of Computer Science, University of Toronto", "aff_domain": "cs.toronto.edu;waabi.ai;cs.toronto.edu;cs.toronto.edu;cs.toronto.edu;cs.toronto.edu", "position": "PhD student;Researcher;PhD student;Full Professor;PhD student;PhD student", "bibtex": "@inproceedings{\nzhang2023learning,\ntitle={Learning Realistic Traffic Agents in Closed-loop},\nauthor={Chris Zhang and James Tu and Lunjun Zhang and Kelvin Wong and Simon Suo and Raquel Urtasun},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=yobahDU4HPP}\n}", "github": "", "project": "", "reviewers": "F38n;CEx1;ohnK;gvqu", "site": "https://openreview.net/forum?id=yobahDU4HPP", "pdf_size": 0, "rating": "4;6;6;6", "confidence": "4;5;3;4", "rating_avg": 5.5, "confidence_avg": 4.0, "replies_avg": 21, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12533532343938095225&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "University of Toronto;Waabi Innovation", "aff_unique_dep": "Department of Computer Science;", "aff_unique_url": "https://www.utoronto.ca;https://waabi.ai", "aff_unique_abbr": "U of T;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Canada" }, { "id": "z3D__-nc9y", "title": "Autonomous Robotic Reinforcement Learning with Asynchronous Human Feedback", "track": "main", "status": "Poster", "tldr": "", "abstract": "Ideally, we would place a robot in a real-world environment and leave it there improving on its own by gathering more experience autonomously. However, algorithms for autonomous robotic learning have been challenging to realize in the real world. While this has often been attributed to the challenge of sample complexity, even sample-efficient techniques are hampered by two major challenges - the difficulty of providing well ``shaped\" rewards, and the difficulty of continual reset-free training. 
In this work, we describe a system for real-world reinforcement learning that enables agents to show continual improvement by training directly in the real world without requiring painstaking effort to hand-design reward functions or reset mechanisms. Our system leverages occasional non-expert human-in-the-loop feedback from remote users to learn informative distance functions to guide exploration while leveraging a simple self-supervised learning algorithm for goal-directed policy learning. We show that in the absence of resets, it is particularly important to account for the current ``reachability\" of the exploration policy when deciding which regions of the space to explore. Based on this insight, we instantiate a practical learning system - GEAR, which enables robots to simply be placed in real-world environments and left to train autonomously without interruption. \nThe system streams robot experience to a web interface only requiring occasional asynchronous feedback from remote, crowdsourced, non-expert humans in the form of binary comparative feedback. \nWe evaluate this system on a suite of robotic tasks in simulation and demonstrate its effectiveness at learning behaviors both in simulation and the real world. \nProject website https://guided-exploration-autonomous-rl.github.io/GEAR/.", "keywords": "reset-free reinforcement learning;learning from human feedback", "primary_area": "", "supplementary_material": "/attachment/8abe2787279f421563c10d08445c62a311ade224.zip", "author": "Max Balsells I Pamies;Marcel Torne Villasevil;Zihan Wang;Samedh Desai;Pulkit Agrawal;Abhishek Gupta", "authorids": "~Max_Balsells_I_Pamies1;~Marcel_Torne_Villasevil1;~Zihan_Wang14;~Samedh_Desai1;~Pulkit_Agrawal1;~Abhishek_Gupta1", "gender": "M;M;M;M;M;M", "homepage": "https://www.linkedin.com/in/max-balsells/;https://marceltorne.github.io;https://avinwangzh.github.io/;;https://people.eecs.berkeley.edu/~pulkitag/;https://homes.cs.washington.edu/~abhgupta/", "dblp": ";352/5363;;;149/2672;18/6404-4", "google_scholar": ";ITlelQ8AAAAJ;;;UpZmJI0AAAAJ;1wLVDP4AAAAJ", "orcid": ";;;;;", "linkedin": ";marceltorne/;;sdesai1287;;", "or_profile": "~Max_Balsells_I_Pamies1;~Marcel_Torne_Villasevil1;~Zihan_Wang14;~Samedh_Desai1;~Pulkit_Agrawal1;~Abhishek_Gupta1", "aff": "Universidad Polit\u00e9cnica de Cataluna;Harvard University, Harvard University;University of Washington;University of Washington;Massachusetts Institute of Technology;University of Washington", "aff_domain": "upc.edu;g.harvard.edu;uw.edu;uw.edu;mit.edu;uw.edu", "position": "Undergrad student;MS student;PhD student;Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npamies2023autonomous,\ntitle={Autonomous Robotic Reinforcement Learning with Asynchronous Human Feedback},\nauthor={Max Balsells I Pamies and Marcel Torne Villasevil and Zihan Wang and Samedh Desai and Pulkit Agrawal and Abhishek Gupta},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=z3D__-nc9y}\n}", "github": "https://github.com/guided-exploration-autonomous-rl/gear-code/tree/main", "project": "", "reviewers": "yNZB;SMKn;epUg;oFcp", "site": "https://openreview.net/forum?id=z3D__-nc9y", "pdf_size": 0, "rating": "6;6;6;6", "confidence": "4;4;4;3", "rating_avg": 6.0, "confidence_avg": 3.75, "replies_avg": 16, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10977848610267689724&as_sdt=2005&sciodt=0,5&hl=en", 
"gs_version_total": 4, "aff_unique_index": "0;1;2;2;3;2", "aff_unique_norm": "Universitat Polit\u00e8cnica de Catalunya;Harvard University;University of Washington;Massachusetts Institute of Technology", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.upc.edu;https://www.harvard.edu;https://www.washington.edu;https://web.mit.edu", "aff_unique_abbr": "UPC;Harvard;UW;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Spain;United States" }, { "id": "zUiH8UUYDo", "title": "Scalable Deep Kernel Gaussian Process for Vehicle Dynamics in Autonomous Racing", "track": "main", "status": "Poster", "tldr": "In this work, we have proven that DKL-SKIP, as a scalable deep kernel learning for Gaussian Process, is a promising tool for modeling complex vehicle dynamics in both real-world and simulated environments.", "abstract": "Autonomous racing presents a challenging environment for testing the limits of autonomous vehicle technology. Accurately modeling the vehicle dynamics (with all forces and tires) is critical for high-speed racing, but it remains a difficult task and requires an intricate balance between run-time computational demands and modeling complexity. Researchers have proposed utilizing learning-based methods such as Gaussian Process (GP) for learning vehicle dynamics. However, current approaches often oversimplify the modeling process or apply strong assumptions, leading to unrealistic results that cannot translate to real-world settings.\nIn this paper, we proposed DKL-SKIP method for vehicle dynamics modeling.\nOur approach outperforms standard GP methods and the N4SID system identification technique in terms of prediction accuracy. \nIn addition to evaluating DKL-SKIP on real-world data, we also evaluate its performance using a high-fidelity autonomous racing AutoVerse simulator.\nThe results highlight the potential of DKL-SKIP as a promising tool for modeling complex vehicle dynamics in both real-world and simulated environments.", "keywords": "Gaussian Process;Vehicle Dynamics;Autonomous Vehicle;Deep Kernel Learning", "primary_area": "", "supplementary_material": "/attachment/e739398d3b2f4d8ccbe343dbe1cff95af6663992.zip", "author": "Jingyun Ning;Madhur Behl", "authorids": "~Jingyun_Ning1;~Madhur_Behl1", "gender": "M;M", "homepage": ";https://www.madhurbehl.com/", "dblp": ";10/9662", "google_scholar": ";https://scholar.google.com.tw/citations?user=bj_imaYAAAAJ", "orcid": ";", "linkedin": "jingyun-ning-414513128/;madhurbehl/", "or_profile": "~Jingyun_Ning1;~Madhur_Behl1", "aff": "University of Virginia, Charlottesville;University of Virginia", "aff_domain": "virginia.edu;virginia.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nning2023scalable,\ntitle={Scalable Deep Kernel Gaussian Process for Vehicle Dynamics in Autonomous Racing},\nauthor={Jingyun Ning and Madhur Behl},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=zUiH8UUYDo}\n}", "github": "", "project": "", "reviewers": "Fjkf;nK5o;AEJt;c6Hr", "site": "https://openreview.net/forum?id=zUiH8UUYDo", "pdf_size": 0, "rating": "1;6;6;6", "confidence": "5;3;5;3", "rating_avg": 4.75, "confidence_avg": 4.0, "replies_avg": 12, "authors#_avg": 2, "corr_rating_confidence": -0.5773502691896257, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18153464067837405251&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": 
"0;0", "aff_unique_norm": "University of Virginia", "aff_unique_dep": "", "aff_unique_url": "https://www.virginia.edu", "aff_unique_abbr": "UVA", "aff_campus_unique_index": "0", "aff_campus_unique": "Charlottesville;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "zvl2LuLTtgr", "title": "What Went Wrong? Closing the Sim-to-Real Gap via Differentiable Causal Discovery", "track": "main", "status": "Poster", "tldr": "We introduce an approach that aims to align the simulator with the real world by discovering the causality between the environment parameters and the sim-to-real gap.", "abstract": "Training control policies in simulation is more appealing than on real robots directly, as it allows for exploring diverse states in an efficient manner. Yet, robot simulators inevitably exhibit disparities from the real-world \\rebut{dynamics}, yielding inaccuracies that manifest as the dynamical simulation-to-reality (sim-to-real) gap. Existing literature has proposed to close this gap by actively modifying specific simulator parameters to align the simulated data with real-world observations. However, the set of tunable parameters is usually manually selected to reduce the search space in a case-by-case manner, which is hard to scale up for complex systems and requires extensive domain knowledge. To address the scalability issue and automate the parameter-tuning process, we introduce COMPASS, which aligns the simulator with the real world by discovering the causal relationship between the environment parameters and the sim-to-real gap. Concretely, our method learns a differentiable mapping from the environment parameters to the differences between simulated and real-world robot-object trajectories. This mapping is governed by a simultaneously learned causal graph to help prune the search space of parameters, provide better interpretability, and improve generalization on unseen parameters. We perform experiments to achieve both sim-to-sim and sim-to-real transfer, and show that our method has significant improvements in trajectory alignment and task success rate over strong baselines in several challenging manipulation tasks. 
Demos are available on our project website: https://sites.google.com/view/sim2real-compass.", "keywords": "sim-to-real gap;reinforcement learning;causal discovery", "primary_area": "", "supplementary_material": "/attachment/a8766700e74d1b3f4a3c9173437babdcd58a2a53.zip", "author": "Peide Huang;Xilun Zhang;Ziang Cao;Shiqi Liu;Mengdi Xu;Wenhao Ding;Jonathan Francis;Bingqing Chen;Ding Zhao", "authorids": "~Peide_Huang1;~Xilun_Zhang1;~Ziang_Cao2;~Shiqi_Liu2;~Mengdi_Xu3;~Wenhao_Ding1;~Jonathan_Francis1;~Bingqing_Chen2;~Ding_Zhao1", "gender": ";M;;M;F;M;;F;", "homepage": "https://peidehuang.github.io/;https://github.com/XilunZhangRobo;;https://shiqiliu-67.github.io/;https://mxu34.github.io/;https://wenhao.pub;;;https://safeai-lab.github.io", "dblp": "295/8645;;;;;215/3667.html;;;", "google_scholar": "g5U-sjoAAAAJ;;;PiuAi5wAAAAJ;https://scholar.google.com/citations?hl=zh-CN;q2aqI9sAAAAJ;;LYt_2MgAAAAJ;z7tPc9IAAAAJ", "orcid": ";;;;0000-0001-9332-4175;;;;", "linkedin": "peidehuang/;;;shiqiliu2/;;wenhaoding/;;bingqing-chen-631b754a/;", "or_profile": "~Peide_Huang1;~Xilun_Zhang1;~Ziang_Cao2;~Shiqi_Liu2;~Mengdi_Xu3;~Wenhao_Ding1;~Jonathan_Francis1;~Bingqing_Chen2;~Ding_Zhao1", "aff": "Carnegie Mellon University;Carnegie Mellon University;;;Carnegie Mellon University;Carnegie Mellon University;;Bosch;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;;;cmu.edu;cmu.edu;;bosch.com;cmu.edu", "position": "PhD student;MS student;;;PhD student;PhD student;;Researcher;Associate Professor", "bibtex": "@inproceedings{\nhuang2023what,\ntitle={What Went Wrong? Closing the Sim-to-Real Gap via Differentiable Causal Discovery},\nauthor={Peide Huang and Xilun Zhang and Ziang Cao and Shiqi Liu and Mengdi Xu and Wenhao Ding and Jonathan Francis and Bingqing Chen and Ding Zhao},\nbooktitle={7th Annual Conference on Robot Learning},\nyear={2023},\nurl={https://openreview.net/forum?id=zvl2LuLTtgr}\n}", "github": "", "project": "", "reviewers": "mBeP;7MJV;Ltn4;UgPf", "site": "https://openreview.net/forum?id=zvl2LuLTtgr", "pdf_size": 0, "rating": "6;6;10;10", "confidence": "4;3;4;4", "rating_avg": 8.0, "confidence_avg": 3.75, "replies_avg": 21, "authors#_avg": 9, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=524364987362971013&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;1;0", "aff_unique_norm": "Carnegie Mellon University;Robert Bosch GmbH", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.bosch.com", "aff_unique_abbr": "CMU;Bosch", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1;0", "aff_country_unique": "United States;Germany" } ]