[ { "id": "0M7JiV1GFN", "title": "Provably Safe Online Multi-Agent Navigation in Unknown Environments", "track": "main", "status": "Poster", "tldr": "", "abstract": "Control Barrier Functions (CBFs) provide safety guarantees for multi-agent navigation. However, traditional approaches require full knowledge of the environment (e.g., obstacle positions and shapes) to formulate CBFs and hence, are not applicable in unknown environments. This paper overcomes this issue by proposing an Online Exploration-based Control Lyapunov Barrier Function (OE-CLBF) controller. It estimates the unknown environment by learning its corresponding CBF with a Support Vector Machine (SVM) in an online manner, using local neighborhood information, and leverages the latter to generate actions for safe navigation. To reduce the computation incurred by the online SVM training, we use an Imitation Learning (IL) framework to predict the importance of neighboring agents with Graph Attention Networks (GATs), and train the SVM only with information received from neighbors of high `value'. The OE-CLBF allows for decentralized deployment, and importantly, provides provable safety guarantees that we derive in this paper. Experiments corroborate theoretical findings and demonstrate superior performance w.r.t. state-of-the-art baselines in a variety of unknown environments.", "keywords": "Decentralized Multi-Agent Navigation;Unknown Environment;Support Vector Machine;Graph Attention Learning;Control Barrier Function", "primary_area": "", "supplementary_material": "/attachment/d984985e22ac8b5dbd70289ce301727caa08fd7e.zip", "author": "Zhan Gao;Guang Yang;Jasmine Bayrooti;Amanda Prorok", "authorids": "~Zhan_Gao1;~Guang_Yang16;~Jasmine_Bayrooti1;~Amanda_Prorok1", "gender": ";M;F;", "homepage": ";https://www.guang.phd;;", "dblp": ";;;", "google_scholar": ";Nw340R8AAAAJ;;", "orcid": ";;;", "linkedin": ";gyang101/;jasmine-bayrooti/;", "or_profile": "~Zhan_Gao1;~Guang_Yang16;~Jasmine_Bayrooti1;~Amanda_Prorok1", "aff": ";University of Cambridge;University of Cambridge;", "aff_domain": ";cam.ac.uk;cam.ac.uk;", "position": ";Postdoc;PhD student;", "bibtex": "@inproceedings{\ngao2024provably,\ntitle={Provably Safe Online Multi-Agent Navigation in Unknown Environments},\nauthor={Zhan Gao and Guang Yang and Jasmine Bayrooti and Amanda Prorok},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0M7JiV1GFN}\n}", "github": "", "project": "", "reviewers": "nmUB;wuRr;tTRi", "site": "https://openreview.net/forum?id=0M7JiV1GFN", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15023334405611234595&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "0gDbaEtVrd", "title": "One Model to Drift Them All: Physics-Informed Conditional Diffusion Model for Driving at the Limits", "track": "main", "status": "Poster", "tldr": "", "abstract": "Enabling autonomous vehicles to reliably operate at the limits of handling\u2014 where tire forces are saturated 
\u2014 would improve their safety, particularly in scenarios like emergency obstacle avoidance or adverse weather conditions.\nHowever, unlocking this capability is challenging due to the task's dynamic nature and the high sensitivity to uncertain multimodal properties of the road, vehicle, and their dynamic interactions.\nMotivated by these challenges, we propose a framework to learn a conditional diffusion model for high-performance vehicle control using an unlabelled multimodal trajectory dataset.\nWe design the diffusion model to capture the distribution of parameters of a physics-informed data-driven dynamics model.\nBy conditioning the generation process on online measurements, we integrate the diffusion model into a real-time model predictive control framework for driving at the limits, and show that it can adapt on the fly to a given vehicle and environment.\nExtensive experiments on a Toyota Supra and a Lexus LC 500 show that a single diffusion model enables reliable autonomous drifting on both vehicles when operating with different tires in varying road conditions.\nThe model matches the performance of task-specific expert models while outperforming them in generalization to unseen conditions, paving the way towards a general, reliable method for autonomous driving at the limits of handling.", "keywords": "Diffusion Models;Learning for Control;Autonomous Drifting;Model Predictive Control", "primary_area": "", "supplementary_material": "", "author": "Franck Djeumou;Thomas Jonathan Lew;NAN DING;Michael Thompson;Makoto Suminaka;Marcus Greiff;John Subosits", "authorids": "~Franck_Djeumou1;~Thomas_Jonathan_Lew1;~NAN_DING5;~Michael_Thompson5;~Makoto_Suminaka1;~Marcus_Greiff1;~John_Subosits1", "gender": ";M;F;;;M;", "homepage": ";https://thomasjlew.github.io/;;;;https://www.merl.com/people/greiff;", "dblp": ";;;;;224/8045;381/8269", "google_scholar": ";;;;;ThEpJMwAAAAJ;ixdjPGUAAAAJ", "orcid": ";;;0000-0002-3269-7987;;;", "linkedin": ";;nan-ding-b428a72a0/;;;;", "or_profile": "~Franck_Djeumou1;~Thomas_Jonathan_Lew1;~NAN_DING5;~Michael_Thompson5;~Makoto_Suminaka1;~Marcus_Greiff1;~John_Subosits1", "aff": ";Toyota Research Institute;Toyota Research Institute;Toyota Research Institute;;Mitsubishi Electric Research Labs;Toyota Motor Corporation", "aff_domain": ";tri.global;tri.global;tri.global;;merl.com;toyota.com", "position": ";Researcher;Researcher;Researcher;;Researcher;Researcher", "bibtex": "@inproceedings{\ndjeumou2024one,\ntitle={One Model to Drift Them All: Physics-Informed Conditional Diffusion Model for Driving at the Limits},\nauthor={Franck Djeumou and Thomas Jonathan Lew and NAN DING and Michael Thompson and Makoto Suminaka and Marcus Greiff and John Subosits},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=0gDbaEtVrd}\n}", "github": "", "project": "", "reviewers": "ceJU;3p3m;Xoo7", "site": "https://openreview.net/forum?id=0gDbaEtVrd", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16678681261814609729&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "Toyota Research Institute;Mitsubishi Electric Research Laboratories;Toyota Motor Corporation", "aff_unique_dep": ";;", "aff_unique_url": 
"https://www.tri.global;https://www.merl.com;https://www.toyota-global.com", "aff_unique_abbr": "TRI;MERL;Toyota", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;1", "aff_country_unique": "United States;Japan" }, { "id": "1IzW0aniyg", "title": "EscIRL: Evolving Self-Contrastive IRL for Trajectory Prediction in Autonomous Driving", "track": "main", "status": "Poster", "tldr": "", "abstract": "While deep neural networks (DNN) and inverse reinforcement learning (IRL) have both been commonly used in autonomous driving to predict trajectories through learning from expert demonstrations, DNN-based methods suffer from data-scarcity, while IRL-based approaches often struggle with generalizability, making both hard to apply to new driving scenarios. To address these issues, we introduce EscIRL, a novel decoupled bi-level training framework that iteratively learns robust reward models from only a few mixed-scenario demonstrations. At the inner level, EscIRL introduces a self-contrastive IRL module that learns a spectrum of specialized reward functions by contrasting demonstrations across different scenarios. At the outer level, ESCIRL employs an evolving loop that iteratively refines the contrastive sets, ensuring global convergence. Experiments on two multi-scenario datasets, CitySim and INTERACTION, demonstrate the effectiveness of EscIRL, outperforming state-of-the-art DNN and IRL-based methods by 41.3% on average. Notably, we show that EscIRL achieves superior generalizability compared to DNN-based approaches while requiring only a small fraction of the data, effectively addressing data-scarcity constraints. All code and data are available at https://github.com/SiyueWang-CiDi/EscIRL.", "keywords": "Reinforcement Learning;Trajectory Prediction;Autonomous Driving", "primary_area": "", "supplementary_material": "", "author": "Siyue Wang;Zhaorun Chen;Zhuokai Zhao;Chaoli Mao;Yiyang Zhou;Jiayu He;Albert Sibo Hu", "authorids": "~Siyue_Wang4;~Zhaorun_Chen1;~Zhuokai_Zhao1;~Chaoli_Mao1;~Yiyang_Zhou1;~Jiayu_He1;~Albert_Sibo_Hu1", "gender": "M;M;M;M;M;M;M", "homepage": ";https://billchan226.github.io/;https://zhuokai-zhao.com/;;https://yiyangzhou.github.io/;;https://www.linkedin.com/in/al-hu-sibo/", "dblp": ";302/1064;348/5348;;175/1589.html;;", "google_scholar": ";UZg5N5UAAAAJ;EGcdEjEAAAAJ;https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com.hk/citations?user=6KltFMAAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0003-4753-3819;0000-0002-2668-6587;0000-0001-8201-2977;;;;", "linkedin": ";zhaorun-chen-1793b6226/;zhuokai-zhao-a9385169/;;;;al-hu-sibo/", "or_profile": "~Siyue_Wang4;~Zhaorun_Chen1;~Zhuokai_Zhao1;~Chaoli_Mao1;~Yiyang_Zhou1;~Jiayu_He1;~Albert_Sibo_Hu1", "aff": "CIDI Intelligent Driving(Chengdu) Technology Co., Ltd.;University of Chicago;University of Chicago;CIDI;Xi'an Jiaotong University;cidi-lab;CiDi", "aff_domain": "cidi.ai;uchicago.edu;uchicago.edu;cidi.ai;xjtu.edu.cn;cidi.cn;cidi.ai", "position": "Researcher;PhD student;PhD student;Researcher;MS student;Researcher;Researcher", "bibtex": "@inproceedings{\nwang2024escirl,\ntitle={Esc{IRL}: Evolving Self-Contrastive {IRL} for Trajectory Prediction in Autonomous Driving},\nauthor={Siyue Wang and Zhaorun Chen and Zhuokai Zhao and Chaoli Mao and Yiyang Zhou and Jiayu He and Albert Sibo Hu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1IzW0aniyg}\n}", "github": "https://github.com/SiyueWang-CiDi/EscIRL", 
"project": "", "reviewers": "iJ2t;vT2J;RjKv", "site": "https://openreview.net/forum?id=1IzW0aniyg", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16013117666416725039&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;2;3;4;5", "aff_unique_norm": "CIDI Intelligent Driving Technology Co., Ltd.;University of Chicago;Center for International Development and Integration;Xi'an Jiao Tong University;cidi-lab;CiDi", "aff_unique_dep": ";;;;;", "aff_unique_url": ";https://www.uchicago.edu;;https://www.xjtu.edu.cn;;", "aff_unique_abbr": ";UChicago;;XJTU;;", "aff_campus_unique_index": "0", "aff_campus_unique": "Chengdu;", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "China;United States;" }, { "id": "1TEZ1hiY5m", "title": "Learning Robotic Locomotion Affordances and Photorealistic Simulators from Human-Captured Data", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning reliable affordance models which satisfy human preferences is often hindered by a lack of high-quality training data. Similarly, learning visuomotor policies in simulation can be challenging due to the high cost of photo-realistic rendering. We present PAWS: a comprehensive robot learning framework that uses a novel portable data capture rig and processing pipeline to collect long-horizon trajectories that include camera poses, foot poses, terrain meshes, and 3D radiance fields. We also contribute PAWS-Data: an extensive dataset gathered with PAWS containing over 10 hours of indoor and outdoor trajectories spanning a variety of scenes. With PAWS-Data we leverage radiance fields' photo-realistic rendering to generate tens of thousands of viewpoint-augmented images, then produce pixel affordance labels by identifying semantically similar regions to those traversed by the user. On this data we finetune a navigation affordance model from a pretrained backbone, and perform detailed ablations. Additionally, We open source PAWS-Sim, a high-speed photo-realistic simulator which integrates PAWS-Data with IsaacSim, enabling research for visuomotor policy learning. We evaluate the utility of the affordance model on a quadrupedal robot, which plans through affordances to follow pathways and sidewalks, and avoid human collisions. 
Project resources are available on the [website](https://pawslocomotion.com).", "keywords": "Navigation;Dataset;Real2Sim", "primary_area": "", "supplementary_material": "/attachment/abdd8309b2555462a6e48dba872e27d4256670b1.zip", "author": "Alejandro Escontrela;Justin Kerr;Kyle Stachowicz;Pieter Abbeel", "authorids": "~Alejandro_Escontrela1;~Justin_Kerr1;~Kyle_Stachowicz1;~Pieter_Abbeel2", "gender": "M;M;M;M", "homepage": "https://www.escontrela.me;https://kerrj.github.io/;https://kylesta.ch;https://people.eecs.berkeley.edu/~pabbeel/", "dblp": ";;;", "google_scholar": "53OxjmYAAAAJ;;;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ", "orcid": ";;;", "linkedin": "alejandro-escontrela/;;;", "or_profile": "~Alejandro_Escontrela1;~Justin_Kerr1;~Kyle_Stachowicz1;~Pieter_Abbeel2", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Covariant", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;covariant.ai", "position": "PhD student;PhD student;PhD student;Founder", "bibtex": "@inproceedings{\nescontrela2024learning,\ntitle={Learning Robotic Locomotion Affordances and Photorealistic Simulators from Human-Captured Data},\nauthor={Alejandro Escontrela and Justin Kerr and Kyle Stachowicz and Pieter Abbeel},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1TEZ1hiY5m}\n}", "github": "", "project": "", "reviewers": "V5mu;gXMo;ZQSK", "site": "https://openreview.net/forum?id=1TEZ1hiY5m", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;3;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18038620012236506581&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "1jc2zA5Z6J", "title": "Get a Grip: Multi-Finger Grasp Evaluation at Scale Enables Robust Sim-to-Real Transfer", "track": "main", "status": "Poster", "tldr": "", "abstract": "This work explores conditions under which multi-finger grasping algorithms can attain robust sim-to-real transfer. While numerous large datasets facilitate learning *generative* models for multi-finger grasping at scale, reliable real-world dexterous grasping remains challenging, with most methods degrading when deployed on hardware. An alternate strategy is to use *discriminative* grasp evaluation models for grasp selection and refinement, conditioned on real-world sensor measurements. This paradigm has produced state-of-the-art results for vision-based parallel-jaw grasping, but remains unproven in the multi-finger setting. In this work, we find that existing datasets and methods have been insufficient for training discriminative models for multi-finger grasping. To train grasp evaluators at scale, datasets must provide on the order of millions of grasps, including both positive *and negative examples*, with corresponding visual data resembling measurements at inference time. To that end, we release a new, open-source dataset of 3.5M grasps on 4.3K objects annotated with RGB images, point clouds, and trained NeRFs. 
Leveraging this dataset, we train vision-based grasp evaluators that outperform both analytic and generative modeling-based baselines on extensive simulated and real-world trials across a diverse range of objects. We show via numerous ablations that the key factor for performance is indeed the evaluator, and that its quality degrades as the dataset shrinks, demonstrating the importance of our new dataset. Project website at: https://sites.google.com/view/get-a-grip-dataset.", "keywords": "Multi-Fingered Grasping;Large-Scale Grasp Dataset;Sim-to-Real", "primary_area": "", "supplementary_material": "/attachment/f57e56b1137832862b892162a2d7454e9efeda67.zip", "author": "Tyler Ga Wei Lum;Albert H. Li;Preston Culbertson;Krishnan Srinivasan;Aaron Ames;Mac Schwager;Jeannette Bohg", "authorids": "~Tyler_Ga_Wei_Lum1;~Albert_H._Li1;~Preston_Culbertson1;~Krishnan_Srinivasan1;~Aaron_Ames2;~Mac_Schwager1;~Jeannette_Bohg1", "gender": "M;;;M;;M;", "homepage": "https://tylerlum.github.io/;;https://pculbertson.github.io;http://github.com/krishpop;;https://msl.stanford.edu/;https://web.stanford.edu/~bohg/", "dblp": ";;203/8247;02/4773;;22/7012;52/7377", "google_scholar": "kPq6-XIAAAAJ;;8v2kg_0AAAAJ;;TjWwqmwAAAAJ;-EqbTXoAAAAJ;rjnJnEkAAAAJ", "orcid": ";;;;;;0000-0002-4921-7193", "linkedin": "tyler-lum/;;;;;;", "or_profile": "~Tyler_Ga_Wei_Lum1;~Albert_H._Li1;~Preston_Culbertson1;~Krishnan_Srinivasan1;~Aaron_Ames2;~Mac_Schwager1;~Jeannette_Bohg1", "aff": "Stanford University;;California Institute of Technology;Stanford University;California Institute of Technology;Stanford University;Stanford University", "aff_domain": "stanford.edu;;caltech.edu;stanford.edu;caltech.edu;stanford.edu;stanford.edu", "position": "PhD student;;Postdoc;PhD student;Bren Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nlum2024get,\ntitle={Get a Grip: Multi-Finger Grasp Evaluation at Scale Enables Robust Sim-to-Real Transfer},\nauthor={Tyler Ga Wei Lum and Albert H. Li and Preston Culbertson and Krishnan Srinivasan and Aaron Ames and Mac Schwager and Jeannette Bohg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1jc2zA5Z6J}\n}", "github": "https://github.com/tylerlum/get_a_grip", "project": "", "reviewers": "k59v;DT3i;JEWj", "site": "https://openreview.net/forum?id=1jc2zA5Z6J", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 7, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1977971607442443196&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;1;0;0", "aff_unique_norm": "Stanford University;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.caltech.edu", "aff_unique_abbr": "Stanford;Caltech", "aff_campus_unique_index": "0;1;0;1;0;0", "aff_campus_unique": "Stanford;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "1tCteNSbFH", "title": "Trajectory Improvement and Reward Learning from Comparative Language Feedback", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning from human feedback has gained traction in fields like robotics and natural language processing in recent years. 
While prior works mostly rely on human feedback in the form of comparisons, language is a preferable modality that provides more informative insights into user preferences. In this work, we aim to incorporate comparative language feedback to iteratively improve robot trajectories and to learn reward functions that encode human preferences. To achieve this goal, we learn a shared latent space that integrates trajectory data and language feedback, and subsequently leverage the learned latent space to improve trajectories and learn human preferences. To the best of our knowledge, we are the first to incorporate comparative language feedback into reward learning. Our simulation experiments demonstrate the effectiveness of the learned latent space and the success of our learning algorithms. We also conduct human subject studies that show our reward learning algorithm achieves a 23.9% higher subjective score on average and is 11.3% more time-efficient compared to preference-based reward learning, underscoring the superior performance of our method. Our website is at https://liralab.usc.edu/comparative-language-feedback/.", "keywords": "Learning from Human Language Feedback;Reward Learning;Human-Robot Interaction", "primary_area": "", "supplementary_material": "/attachment/993fe7f9ebd8f0878d8a1f80c391eb4abd807e2c.zip", "author": "Zhaojing Yang;Miru Jun;Jeremy Tien;Stuart Russell;Anca Dragan;Erdem Biyik", "authorids": "~Zhaojing_Yang1;~Miru_Jun1;~Jeremy_Tien1;~Stuart_Russell1;~Anca_Dragan1;~Erdem_Biyik1", "gender": "M;F;M;M;F;M", "homepage": "https://github.com/yang-zj1026;;;https://people.eecs.berkeley.edu/~russell/;http://www.ancadragan.com/;http://people.eecs.berkeley.edu/~ebiyik/", "dblp": ";;;;;194/2736", "google_scholar": ";AalqHJ4AAAAJ;;https://scholar.google.com.tw/citations?user=KJGrjCAAAAAJ;;https://scholar.google.com.tr/citations?user=P-G3sjYAAAAJ", "orcid": ";;;;;0000-0002-9516-3130", "linkedin": ";;jeremy-tien/;;;https://linkedin.com/in/ebiyik", "or_profile": "~Zhaojing_Yang1;~Miru_Jun1;~Jeremy_Tien1;~Stuart_Russell1;~Anca_Dragan1;~Erdem_Biyik1", "aff": "University of Southern California;University of Southern California;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of Southern California", "aff_domain": "usc.edu;usc.edu;berkeley.edu;berkeley.edu;berkeley.edu;usc.edu", "position": "MS student;Undergrad student;Undergrad student;Full Professor;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024trajectory,\ntitle={Trajectory Improvement and Reward Learning from Comparative Language Feedback},\nauthor={Zhaojing Yang and Miru Jun and Jeremy Tien and Stuart Russell and Anca Dragan and Erdem Biyik},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=1tCteNSbFH}\n}", "github": "https://github.com/USC-Lira/language-preference-learning", "project": "", "reviewers": "1rvf;e57u;KGNz", "site": "https://openreview.net/forum?id=1tCteNSbFH", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12574631017765717475&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "University of Southern California;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": 
"https://www.usc.edu;https://www.berkeley.edu", "aff_unique_abbr": "USC;UC Berkeley", "aff_campus_unique_index": "0;0;1;1;1;0", "aff_campus_unique": "Los Angeles;Berkeley", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "2AZfKk9tRI", "title": "Multi-agent Reinforcement Learning with Hybrid Action Space for Free Gait Motion Planning of Hexapod Robots", "track": "main", "status": "Poster", "tldr": "", "abstract": "Legged robots are able to overcome challenging terrains through diverse gaits formed by contact sequences. However, environments characterized by discrete footholds present significant challenges. In this paper, we tackle the problem of free gait motion planning for hexapod robots walking in randomly generated plum blossom pile environments. Specifically, we first address the complexity of multi-leg coordination in discrete environments by treating each leg of the hexapod robot as an individual agent. Then, we propose the Hybrid action space Multi-Agent Soft Actor Critic (Hybrid-MASAC) algorithm capable of handling both discrete and continuous actions. Finally, we present an integrated free gait motion planning method based on Hybrid-MASAC, streamlining gait, Center of Mass (COM), and foothold sequences planning into a single model. Comparative and ablation experiments in both of the simulated and real plum blossom pile environments demonstrate the feasibility and efficiency of our method.", "keywords": "Free Gait;Hexapod Robot;Hybrid Action Space;Multi-agent Reinforcement Learning", "primary_area": "", "supplementary_material": "", "author": "Huiqiao Fu;Kaiqiang Tang;Peng Li;Guizhou Deng;Chunlin Chen", "authorids": "~Huiqiao_Fu1;~Kaiqiang_Tang1;~Peng_Li10;~Guizhou_Deng1;~Chunlin_Chen1", "gender": "M;M;M;;M", "homepage": ";https://scholar.google.com/citations?user=gkKO99wAAAAJ&hl=zh-CN;https://pengli80.github.io/;;https://sme.nju.edu.cn/ccl/list.htm", "dblp": "243/7065;;;;68/6992.html", "google_scholar": ";gkKO99wAAAAJ;;;", "orcid": ";;;0000-0001-6589-4402;", "linkedin": ";;;;", "or_profile": "~Huiqiao_Fu1;~Kaiqiang_Tang1;~Peng_Li10;~Guizhou_Deng1;~Chunlin_Chen1", "aff": "Nanjing University;Nanjing University;Institute of Software, Chinese Academy of Sciences;;Nanjing University", "aff_domain": "nju.edu.cn;smail.nju.edu.cn;iscas.ac.cn;;nju.edu.cn", "position": "PhD student;PhD student;Associate Professor;;Full Professor", "bibtex": "@inproceedings{\nfu2024multiagent,\ntitle={Multi-agent Reinforcement Learning with Hybrid Action Space for Free Gait Motion Planning of Hexapod Robots},\nauthor={Huiqiao Fu and Kaiqiang Tang and Peng Li and Guizhou Deng and Chunlin Chen},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2AZfKk9tRI}\n}", "github": "", "project": "", "reviewers": "XYuJ;oUGJ;z2Hc", "site": "https://openreview.net/forum?id=2AZfKk9tRI", "pdf_size": 0, "rating": "2;2;3", "confidence": "3;3;4", "rating_avg": 2.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:gcsrUKvYDM0J:scholar.google.com/&scioq=Multi-agent+Reinforcement+Learning+with+Hybrid+Action+Space+for+Free+Gait+Motion+Planning+of+Hexapod+Robots&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Nanjing University;Chinese Academy of Sciences", "aff_unique_dep": ";Institute of Software", "aff_unique_url": 
"https://www.nju.edu.cn;http://www.ios.ac.cn", "aff_unique_abbr": "Nanjing U;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "2CScZqkUPZ", "title": "Genetic Algorithm for Curriculum Design in Multi-Agent Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "As the deployment of autonomous agents in real-world scenarios grows, so does the interest in their application to competitive environments with other robots. Self-play in Reinforcement Learning (RL) enables agents to develop competitive strategies. However, the complexity arising from multi-agent interactions and the tendency for RL agents to disrupt competitors' training introduce instability and a risk of overfitting. While traditional methods depend on costly Nash equilibrium approximations or random exploration for training scenario optimization, this can be inefficient in large search spaces often prevalent in multi-agent problems. However, related works in single-agent setups show that genetic algorithms perform better in large scenario spaces. Therefore, we propose using genetic algorithms to adaptively adjust environment parameters and opponent policies in a multi-agent context to find and synthesize coherent scenarios efficiently. We also introduce GenOpt Agent\u2014a genetically optimized, open-loop agent executing scheduled actions. The open-loop aspect of GenOpt prevents RL agents from winning through adversarial perturbations, thereby fostering generalizable strategies. Also, GenOpt is genetically optimized without expert supervision, negating the need for expensive expert supervision to have meaningful opponents at the start of training. Our empirical studies indicate that this method surpasses several established baselines in two-player competitive settings with continuous action spaces, validating its effectiveness and stability in training.", "keywords": "Reinforcement Learning;Multiagent Learning;Curricular Learning", "primary_area": "", "supplementary_material": "/attachment/d163caeab2d1892e26d1bd7fbde7bc6ae0595a39.zip", "author": "Yeeho Song;Jeff Schneider", "authorids": "~Yeeho_Song1;~Jeff_Schneider1", "gender": "M;", "homepage": ";https://www.cs.cmu.edu/~schneide", "dblp": "203/5532.html;38/247", "google_scholar": "Qj1bwsAAAAAJ;3bSbb20AAAAJ", "orcid": ";0000-0002-5080-9073", "linkedin": ";jeff-schneider-1593b322/", "or_profile": "~Yeeho_Song1;~Jeff_Schneider1", "aff": "Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cs.cmu.edu", "position": "PhD student;Researcher", "bibtex": "@inproceedings{\nsong2024genetic,\ntitle={Genetic Algorithm for Curriculum Design in Multi-Agent Reinforcement Learning},\nauthor={Yeeho Song and Jeff Schneider},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2CScZqkUPZ}\n}", "github": "https://github.com/yeehos/GEnetic-Multiagent-Selfplay", "project": "", "reviewers": "1f7s;FqYm;7Pcd;jtXR", "site": "https://openreview.net/forum?id=2CScZqkUPZ", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "5;5;4;4", "rating_avg": 3.0, "confidence_avg": 4.5, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": -0.7071067811865476, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10010759761963343218&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", 
"aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "2LLu3gavF1", "title": "Robot See Robot Do: Imitating Articulated Object Manipulation with Monocular 4D Reconstruction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humans can learn to manipulate new objects by simply watching others; providing robots with the ability to learn from such demonstrations would enable a natural interface specifying new behaviors. This work develops Robot See Robot Do (RSRD), a method for imitating articulated object manipulation from a single monocular RGB human demonstration given a single static multi- view object scan. We first propose 4D Differentiable Part Models (4D-DPM), a method for recovering 3D part motion from a monocular video with differentiable rendering. This analysis-by-synthesis approach uses part-centric feature fields in an iterative optimization which enables the use of geometric regularizers to re- cover 3D motions from only a single video. Given this 4D reconstruction, the robot replicates object trajectories by planning bimanual arm motions that induce the demonstrated object part motion. By representing demonstrations as part- centric trajectories, RSRD focuses on replicating the demonstration\u2019s intended behavior while considering the robot\u2019s own morphological limits, rather than at- tempting to reproduce the hand\u2019s motion. We evaluate 4D-DPM\u2019s 3D tracking accuracy on ground truth annotated 3D part trajectories and RSRD\u2019s physical ex- ecution performance on 9 objects across 10 trials each on a bimanual YuMi robot. Each phase of RSRD achieves an average of 87% success rate, for a total end- to-end success rate of 60% across 90 trials. Notably, this is accomplished using only feature fields distilled from large pretrained vision models \u2014 without any task-specific training, fine-tuning, dataset collection, or annotation. 
Project page: https://robot-see-robot-do.github.io", "keywords": "Feature Fields;Visual Imitation;Grasping;Articulated Objects", "primary_area": "", "supplementary_material": "/attachment/eff56165158d01ceca747ddfa925d7059e0c96a2.zip", "author": "Justin Kerr;Chung Min Kim;Mingxuan Wu;Brent Yi;Qianqian Wang;Ken Goldberg;Angjoo Kanazawa", "authorids": "~Justin_Kerr1;~Chung_Min_Kim1;~Mingxuan_Wu1;~Brent_Yi1;~Qianqian_Wang2;~Ken_Goldberg1;~Angjoo_Kanazawa1", "gender": "M;;M;M;F;M;F", "homepage": "https://kerrj.github.io/;https://chungmin99.github.io/;https://mingxuan-w.github.io/website/;;https://www.cs.cornell.edu/~qqw/;http://goldberg.berkeley.edu/;https://people.eecs.berkeley.edu/~kanazawa/", "dblp": ";305/3515;;239/5167;118/6735-2;g/KennethYGoldberg;119/1305", "google_scholar": ";ODr5lMgAAAAJ;https://scholar.google.ca/citations?user=gcj2V7IAAAAJ;https://scholar.google.com/citations?hl=en;VdmfIeUAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ;Ci-_QYIAAAAJ", "orcid": ";;;;;0000-0001-6747-9499;", "linkedin": ";;;;;goldbergken/;", "or_profile": "~Justin_Kerr1;~Chung_Min_Kim1;~Mingxuan_Wu1;~Brent_Yi1;~Qianqian_Wang2;~Ken_Goldberg1;~Angjoo_Kanazawa1", "aff": "University of California, Berkeley;University of California, Berkeley;Xi'an Jiaotong University;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;xjtu.edu;berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;Undergrad student;PhD student;Postdoc;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkerr2024robot,\ntitle={Robot See Robot Do: Imitating Articulated Object Manipulation with Monocular 4D Reconstruction},\nauthor={Justin Kerr and Chung Min Kim and Mingxuan Wu and Brent Yi and Qianqian Wang and Ken Goldberg and Angjoo Kanazawa},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2LLu3gavF1}\n}", "github": "https://github.com/kerrj/rsrd", "project": "", "reviewers": "rPHN;btbD;P34h", "site": "https://openreview.net/forum?id=2LLu3gavF1", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12710598611497962274&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;0;0;0", "aff_unique_norm": "University of California, Berkeley;Xi'an Jiao Tong University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.xjtu.edu.cn", "aff_unique_abbr": "UC Berkeley;XJTU", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "2SYFDG4WRA", "title": "Manipulate-Anything: Automating Real-World Robots using Vision-Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large-scale endeavors like RT-1 and widespread community efforts such as Open-X-Embodiment have contributed to growing the scale of robot demonstration data. However, there is still an opportunity to improve the quality, quantity, and diversity of robot demonstration data. 
Although vision-language models have been shown to automatically generate demonstration data, their utility has been limited to environments with privileged state information, they require hand-designed skills, and are limited to interactions with few object instances. We propose Manipulate-Anything, a scalable automated generation method for real-world robotic manipulation.\nUnlike prior work, our method can operate in real-world environments without any privileged state information, hand-designed skills, and can manipulate any static object. We evaluate our method using two setups. First, Manipulate-Anything successfully generates trajectories for all 5 real-world and 12 simulation tasks, significantly outperforming existing methods like VoxPoser. \nSecond, Manipulate-Anything's demonstrations can train more robust behavior cloning policies than training with human demonstrations, or from data generated by VoxPoser and Code-As-Policies.\nWe believe Manipulate-Anything can be the scalable method for both generating data for robotics and solving novel tasks in a zero-shot setting. Anonymous project page: manipulate-anything.github.io.", "keywords": "Robot Learning; Multimodal Large Language Model; Data Generation; Imitation Learning; Behavior Cloning", "primary_area": "", "supplementary_material": "/attachment/12563b4e01eee0d581683b9bfa185910d242b1da.zip", "author": "Jiafei Duan;Wentao Yuan;Wilbert Pumacay;Yi Ru Wang;Kiana Ehsani;Dieter Fox;Ranjay Krishna", "authorids": "~Jiafei_Duan1;~Wentao_Yuan1;~Wilbert_Pumacay1;~Yi_Ru_Wang1;~Kiana_Ehsani1;~Dieter_Fox1;~Ranjay_Krishna1", "gender": "M;M;M;;F;M;M", "homepage": "https://duanjiafei.com/;https://wentaoyuan.github.io;https://wpumacay.github.io;;https://ehsanik.github.io/;https://homes.cs.washington.edu/~fox/;http://ranjaykrishna.com", "dblp": "275/9973.html;225/4795.html;;302/0208;198/0910;f/DieterFox;167/3785", "google_scholar": "d1WCSJIAAAAJ;PZZZG6YAAAAJ;;OTL-u30AAAAJ;RScZCLEAAAAJ;DqXsbPAAAAAJ;IcqahyAAAAAJ", "orcid": ";0000-0002-3836-8877;;;;;0000-0001-8784-2531", "linkedin": "jiafei-duan-a69b11112/;;;yi-ru-helen-wang/;kiana-ehsani-1b81b0162/;;ranjay-krishna-1a344444/", "or_profile": "~Jiafei_Duan1;~Wentao_Yuan1;~Wilbert_Pumacay1;~Yi_Ru_Wang1;~Kiana_Ehsani1;~Dieter_Fox1;~Ranjay_Krishna1", "aff": "NVIDIA;University of Washington, Seattle;Universidad Nacional de Ingenier\u00eda;University of Washington;Allen Institute for Artificial Intelligence;Department of Computer Science;University of Washington", "aff_domain": "nvidia.com;uw.edu;uni.edu.pe;washington.edu;allenai.org;cs.washington.edu;cs.washington.edu", "position": "Intern;PhD student;Researcher;PhD student;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nduan2024manipulateanything,\ntitle={Manipulate-Anything: Automating Real-World Robots using Vision-Language Models},\nauthor={Jiafei Duan and Wentao Yuan and Wilbert Pumacay and Yi Ru Wang and Kiana Ehsani and Dieter Fox and Ranjay Krishna},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2SYFDG4WRA}\n}", "github": "https://github.com/Robot-MA/manipulate-anything/tree/main", "project": "", "reviewers": "Ssao;3U2k;1Hnu", "site": "https://openreview.net/forum?id=2SYFDG4WRA", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;3;5", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.5, "gs_citation": 39, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=16206908892512536184&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;1;3;4;1", "aff_unique_norm": "NVIDIA;University of Washington;Universidad Nacional de Ingenier\u00eda;Allen Institute for Artificial Intelligence;Unknown Institution", "aff_unique_dep": "NVIDIA Corporation;;;;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;https://www.washington.edu;https://www.uni.edu.pe;https://allenai.org;", "aff_unique_abbr": "NVIDIA;UW;UNI;AI2;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;Peru;" }, { "id": "2sg4PY1W9d", "title": "Learning Transparent Reward Models via Unsupervised Feature Selection", "track": "main", "status": "Poster", "tldr": "", "abstract": "In complex real-world tasks such as robotic manipulation and autonomous driving, collecting expert demonstrations is often more straightforward than specifying precise learning objectives and task descriptions. Learning from expert data can be achieved through behavioral cloning or by learning a reward function, i.e., inverse reinforcement learning. The latter allows for training with additional data outside the training distribution, guided by the inferred reward function. We propose a novel approach to construct compact and interpretable reward models from automatically selected state features. These inferred rewards have an explicit form and enable the learning of policies that closely match expert behavior by training standard reinforcement learning algorithms from scratch. We validate our method's performance in various robotic environments with continuous and high-dimensional state spaces.", "keywords": "Inverse reinforcement learning;Reinforcement learning;Imitation learning;Robots;Reward learning;Robot learning", "primary_area": "", "supplementary_material": "/attachment/12632391c43ee33427cf74813a3411073b2b99b0.zip", "author": "Daulet Baimukashev;Gokhan Alcan;Kevin Sebastian Luck;Ville Kyrki", "authorids": "~Daulet_Baimukashev1;~Gokhan_Alcan1;~Kevin_Sebastian_Luck1;~Ville_Kyrki1", "gender": "M;M;;", "homepage": ";https://www.gokhanalcan.com/;;https://irobotics.aalto.fi", "dblp": "232/9856.html;;153/7680;07/2806", "google_scholar": "pI4K5z4AAAAJ;https://scholar.google.com.tr/citations?user=XSVNKtwAAAAJ;;8OBnyXQAAAAJ", "orcid": "0000-0002-1432-8205;0000-0003-3025-5883;;", "linkedin": "daulet-baimukashev/;gokhanalcan/;;", "or_profile": "~Daulet_Baimukashev1;~Gokhan_Alcan1;~Kevin_Sebastian_Luck1;~Ville_Kyrki1", "aff": "Aalto University;Aalto University;Vrije Universiteit Amsterdam;Aalto University", "aff_domain": "aalto.fi;aalto.fi;vu.nl;aalto.fi", "position": "PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nbaimukashev2024learning,\ntitle={Learning Transparent Reward Models via Unsupervised Feature Selection},\nauthor={Daulet Baimukashev and Gokhan Alcan and Kevin Sebastian Luck and Ville Kyrki},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=2sg4PY1W9d}\n}", "github": "https://github.com/baimukashev/reward-learning", "project": "", "reviewers": "RAMo;rzLp;psPt", "site": "https://openreview.net/forum?id=2sg4PY1W9d", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:Oi_nYzYVVUgJ:scholar.google.com/&scioq=Learning+Transparent+Reward+Models+via+Unsupervised+Feature+Selection&hl=en&as_sdt=0,33", "gs_version_total": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Aalto University;Vrije Universiteit Amsterdam", "aff_unique_dep": ";", "aff_unique_url": "https://www.aalto.fi;https://www.vu.nl", "aff_unique_abbr": "Aalto;VU Amsterdam", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Finland;Netherlands" }, { "id": "3NI5SxsJqf", "title": "Accelerating Visual Sparse-Reward Learning with Latent Nearest-Demonstration-Guided Explorations", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent progress in deep reinforcement learning (RL) and computer vision enables artificial agents to solve complex tasks, including locomotion, manipulation, and video games from high-dimensional pixel observations. However, RL usually relies on domain-specific reward functions for sufficient learning signals, requiring expert knowledge. While vision-based agents could learn skills from only sparse rewards, exploration challenges arise. We present Latent Nearest-demonstration-guided Exploration (LaNE), a novel and efficient method to solve sparse-reward robot manipulation tasks from image observations and a few demonstrations. First, LaNE builds on the pre-trained DINOv2 feature extractor to learn an embedding space for forward prediction. Next, it rewards the agent for exploring near the demos, quantified by quadratic control costs in the embedding space. Finally, LaNE optimizes the policy for the augmented rewards with RL. Experiments demonstrate that our method achieves state-of-the-art sample efficiency in Robosuite simulation and enables under-an-hour RL training from scratch on a Franka Panda robot, using only a few demonstrations.", "keywords": "Computer Vision;Sparse Reward;RL from Demonstrations", "primary_area": "", "supplementary_material": "/attachment/ec7e7bbc79505c3368ffdcc9228b23303a131063.zip", "author": "Ruihan Zhao;ufuk topcu;Sandeep P. Chinchali;Mariano Phielipp", "authorids": "~Ruihan_Zhao1;~ufuk_topcu1;~Sandeep_P._Chinchali1;~Mariano_Phielipp2", "gender": "M;Unspecified;M;", "homepage": "https://philipzrh.com;https://autonomy.oden.utexas.edu/;https://www.intel.com/content/www/us/en/research/researchers/mariano-phielipp.html;https://www.ece.utexas.edu/people/faculty/sandeep-chinchali", "dblp": "236/4741-1;12/6659.html;23/4518;85/8366", "google_scholar": ";jeNGFfQAAAAJ;YArRsvEAAAAJ;262ASa4AAAAJ", "orcid": ";0000-0003-0819-9985;;", "linkedin": ";;mariano-phielipp-941624;", "or_profile": "~Ruihan_Zhao1;~ufuk_topcu1;~Mariano_Phielipp2;~Sandeep_Chinchali1", "aff": "University of Texas at Austin;University of Texas, Austin;Intel Labs;University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;intel.com;utexas.edu", "position": "PhD student;Full Professor;Principal Researcher;Assistant Professor", "bibtex": "@inproceedings{\nzhao2024accelerating,\ntitle={Accelerating Visual Sparse-Reward Learning with Latent Nearest-Demonstration-Guided Explorations},\nauthor={Ruihan Zhao and ufuk topcu and Sandeep P. 
Chinchali and Mariano Phielipp},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3NI5SxsJqf}\n}", "github": "", "project": "", "reviewers": "qY4v;9FLB;ic1h", "site": "https://openreview.net/forum?id=3NI5SxsJqf", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:6u1_WzqtS7wJ:scholar.google.com/&scioq=Accelerating+Visual+Sparse-Reward+Learning+with+Latent+Nearest-Demonstration-Guided+Explorations&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Texas at Austin;Intel", "aff_unique_dep": ";Intel Labs", "aff_unique_url": "https://www.utexas.edu;https://www.intel.com", "aff_unique_abbr": "UT Austin;Intel", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "3ZAgXBRvla", "title": "FlowBotHD: History-Aware Diffuser Handling Ambiguities in Articulated Objects Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce a novel approach to manipulate articulated objects with ambiguities, such as opening a door, in which multi-modality and occlusions create ambiguities about the opening side and direction. Multi-modality occurs when the method to open a fully closed door (push, pull, slide) is uncertain, or the side from which it should be opened is uncertain. Occlusions further obscure the door\u2019s shape from certain angles, creating further ambiguities during the occlusion. To tackle these challenges, we propose a history-aware diffusion network that models the multi-modal distribution of the articulated object and uses history to disambiguate actions and make stable predictions under occlusions. Experiments and analysis demonstrate the state-of-art performance of our method and specifically improvements in ambiguity-caused failure modes. 
Our project website is available at https://flowbothd.github.io/.", "keywords": "Ambiguity;Multi-modality;Occlusion;Articulated Objects;Diffusion", "primary_area": "", "supplementary_material": "/attachment/2bb523f3afa03f2bf298fa1508ce5f3e58c195dc.zip", "author": "Yishu Li;Wen Hui Leng;Yiming Fang;Ben Eisner;David Held", "authorids": "~Yishu_Li1;~Wen_Hui_Leng1;~Yiming_Fang1;~Ben_Eisner1;~David_Held1", "gender": "F;;F;M;M", "homepage": "https://liy1shu.github.io/;;;;http://davheld.github.io/", "dblp": "247/2570;;;;22/11147", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;;;RWe-v0UAAAAJ;0QtU-NsAAAAJ", "orcid": ";;;;", "linkedin": ";leng-wen-hui;yiming-fang-6869a2141/;;", "or_profile": "~Yishu_Li1;~Wen_Hui_Leng1;~Yiming_Fang1;~Ben_Eisner1;~David_Held1", "aff": "Tsinghua University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cs.tsinghua.edu.cn;cmu.edu;andrew.cmu.edu;cmu.edu;cmu.edu", "position": "Undergrad student;Undergrad student;Undergrad student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nli2024flowbothd,\ntitle={FlowBot{HD}: History-Aware Diffuser Handling Ambiguities in Articulated Objects Manipulation},\nauthor={Yishu Li and Wen Hui Leng and Yiming Fang and Ben Eisner and David Held},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3ZAgXBRvla}\n}", "github": "", "project": "", "reviewers": "aMTY;niS8;LDDR", "site": "https://openreview.net/forum?id=3ZAgXBRvla", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:SkerbnYcZOcJ:scholar.google.com/&scioq=FlowBotHD:+History-Aware+Diffuser+Handling+Ambiguities+in+Articulated+Objects+Manipulation&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Tsinghua University;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.cmu.edu", "aff_unique_abbr": "THU;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "3bcujpPikC", "title": "FREA: Feasibility-Guided Generation of Safety-Critical Scenarios with Reasonable Adversariality", "track": "main", "status": "Poster", "tldr": "", "abstract": "Generating safety-critical scenarios, which are essential yet difficult to collect at scale, offers an effective method to evaluate the robustness of autonomous vehicles (AVs). Existing methods focus on optimizing adversariality while preserving the naturalness of scenarios, aiming to achieve a balance through data-driven approaches. However, without an appropriate upper bound for adversariality, the scenarios might exhibit excessive adversariality, potentially leading to unavoidable collisions. In this paper, we introduce FREA, a novel safety-critical scenarios generation method that incorporates the Largest Feasible Region (LFR) of AV as guidance to ensure the reasonableness of the adversarial scenarios. Concretely, FREA initially pre-calculates the LFR of AV from offline datasets. 
Subsequently, it learns a reasonable adversarial policy that controls critical background vehicles (CBVs) in the scene to generate adversarial yet AV-feasible scenarios by maximizing a novel feasibility-dependent objective function. Extensive experiments illustrate that FREA can effectively generate safety-critical scenarios, yielding considerable near-miss events while ensuring AV's feasibility. Generalization analysis also confirms the robustness of FREA in AV testing across various surrogate AV methods and traffic environments.", "keywords": "Feasibility;Scenario Generation;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/3c0021e3c9f3fe3c4cd6196df2ec396baeca02a0.zip", "author": "Keyu Chen;Yuheng Lei;Hao Cheng;Haoran Wu;Wenchao Sun;Sifa Zheng", "authorids": "~Keyu_Chen5;~Yuheng_Lei1;~Hao_Cheng21;~Haoran_Wu9;~Wenchao_Sun1;~Sifa_Zheng1", "gender": "M;M;M;M;M;M", "homepage": "https://currychen77.github.io/;https://sites.google.com/view/yuhenglei;https://wuhaoran111.github.io/;;http://www.svm.tsinghua.edu.cn/essay/80/1835.html;", "dblp": ";312/6546.html;19/4036;;;", "google_scholar": "m_bC1VAAAAAJ;;5hmsPUYAAAAJ;https://scholar.google.com/citations?hl=en;;kBH-iGsAAAAJ", "orcid": "0000-0001-6169-4710;0009-0006-1940-3573;;0000-0003-2969-4096;0000-0001-5160-1365;0009-0009-9401-2937", "linkedin": ";yuhenglei;;;;", "or_profile": "~Keyu_Chen5;~Yuheng_Lei1;~Haoran_Wu9;~Wenchao_Sun1;~Sifa_Zheng1;~\u7a0b\u6d691", "aff": "Tsinghua University;The University of Hong Kong;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;connect.hku.hk;mail.tsinghua.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;PhD student;Full Professor;PhD student", "bibtex": "@inproceedings{\nchen2024frea,\ntitle={{FREA}: Feasibility-Guided Generation of Safety-Critical Scenarios with Reasonable Adversariality},\nauthor={Keyu Chen and Yuheng Lei and Hao Cheng and Haoran Wu and Wenchao Sun and Sifa Zheng},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3bcujpPikC}\n}", "github": "https://github.com/CurryChen77/FREA", "project": "", "reviewers": "2ukq;B6vA;oVnR", "site": "https://openreview.net/forum?id=3bcujpPikC", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16869810077653555102&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0;0", "aff_unique_norm": "Tsinghua University;University of Hong Kong", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.hku.hk", "aff_unique_abbr": "THU;HKU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "3i7j8ZPnbm", "title": "UMI-on-Legs: Making Manipulation Policies Mobile with Manipulation-Centric Whole-body Controllers", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce UMI-on-Legs, a new framework that combines real-world and simulation data for quadruped manipulation systems. We scale task-centric data collection in the real world using a handheld gripper (UMI), providing a cheap way to demonstrate task-relevant manipulation skills without a robot. 
Simultaneously, we scale robot-centric data in simulation by training a whole-body controller. The interface between these two policies are end-effector trajectories in the task-frame, which are inferred by the manipulation policy and passed to the whole-body controller for tracking. We evaluate UMI-on-Legs on prehensile, non-prehensile, and dynamic manipulation tasks, and report over 70% success rate for all tasks. Lastly, we also demonstrate the zero-shot cross-embodiment deployment of a pre-trained manipulation policy checkpoint from a prior work, originally intended for a fixed-base robot arm, on our quadruped system. We believe this framework provides a scalable path towards learning expressive manipulation skills on dynamic robot embodiments.", "keywords": "Manipulation;Visuo-motor Policy;Whole-body Controller", "primary_area": "", "supplementary_material": "/attachment/6481d8b227d2365719a25d9f3ca5370a69ab1482.zip", "author": "Huy Ha;Yihuai Gao;Zipeng Fu;Jie Tan;Shuran Song", "authorids": "~Huy_Ha1;~Yihuai_Gao1;~Zipeng_Fu1;~Jie_Tan1;~Shuran_Song3", "gender": "M;M;M;M;F", "homepage": "https://www.cs.columbia.edu/~huy/;;https://zipengfu.github.io;http://www.jie-tan.net;https://shurans.github.io/", "dblp": "277/9554;;245/1504;81/7419;", "google_scholar": "-3-f_8YAAAAJ;;wMcPTbEAAAAJ;neGbgzYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";yihuai-gao-167711245/;zipengfu;jie-tan/;", "or_profile": "~Huy_Ha1;~Yihuai_Gao1;~Zipeng_Fu1;~Jie_Tan1;~Shuran_Song3", "aff": "Columbia University;Stanford University;Stanford University;Google;Stanford University", "aff_domain": "columbia.edu;stanford.edu;stanford.edu;google.com;stanford.edu", "position": "PhD student;PhD student;PhD student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nha2024umionlegs,\ntitle={{UMI}-on-Legs: Making Manipulation Policies Mobile with Manipulation-Centric Whole-body Controllers},\nauthor={Huy Ha and Yihuai Gao and Zipeng Fu and Jie Tan and Shuran Song},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3i7j8ZPnbm}\n}", "github": "https://github.com/real-stanford/umi-on-legs", "project": "", "reviewers": "uNw1;H1aK;ZZoE", "site": "https://openreview.net/forum?id=3i7j8ZPnbm", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14318772117681815721&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;1", "aff_unique_norm": "Columbia University;Stanford University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.columbia.edu;https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Columbia;Stanford;Google", "aff_campus_unique_index": "1;1;2;1", "aff_campus_unique": ";Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "3jNEz3kUSl", "title": "PointPatchRL - Masked Reconstruction Improves Reinforcement Learning on Point Clouds", "track": "main", "status": "Poster", "tldr": "", "abstract": "Perceiving the environment via cameras is crucial for Reinforcement Learning (RL) in robotics. While images are a convenient form of representation, they often complicate extracting important geometric details, especially with varying geometries or deformable objects. 
In contrast, point clouds naturally represent this geometry and easily integrate color and positional data from multiple camera views. However, while point-cloud processing with deep learning has seen many recent successes, RL on point clouds is under-researched, with only the simplest encoder architecture considered in the literature. We introduce PointPatchRL (PPRL), a method for RL on point clouds that builds on the common paradigm of dividing point clouds into overlapping patches, tokenizing them, and processing\nthe tokens with transformers. PPRL provides significant improvements compared with other point-cloud processing architectures previously used for RL. We then complement PPRL with masked reconstruction for representation learning and show that our method outperforms strong model-free and model-based baselines on image observations in complex manipulation tasks containing deformable objects and variations in target object geometry.", "keywords": "Point Clouds;Self-Supervised Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/44302b70e61f4aed693ec23656aef478724d0b57.zip", "author": "Balazs Gyenes;Nikolai Franke;Philipp Becker;Gerhard Neumann", "authorids": "~Balazs_Gyenes1;~Nikolai_Franke1;~Philipp_Becker1;~Gerhard_Neumann2", "gender": "M;;M;M", "homepage": "https://alr.iar.kit.edu/21_527.php;https://github.com/nikolai-franke;;https://alr.anthropomatik.kit.edu/", "dblp": "304/2555;;66/1316;60/4878", "google_scholar": "5ZtsRSgAAAAJ;;https://scholar.google.de/citations?user=jXx-LuQAAAAJ;https://scholar.google.com.tw/citations?user=GL360kMAAAAJ", "orcid": "0000-0002-4430-1820;;;", "linkedin": ";;;", "or_profile": "~Balazs_Gyenes1;~Nikolai_Franke1;~Philipp_Becker1;~Gerhard_Neumann1", "aff": "Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;FZI Forschungszentrum Informatik ;Karlsruhe Institute of Technology", "aff_domain": "kit.edu;kit.edu;fzi.de;kit.edu", "position": "PhD student;MS student;Researcher;Full Professor", "bibtex": "@inproceedings{\ngyenes2024pointpatchrl,\ntitle={PointPatch{RL} - Masked Reconstruction Improves Reinforcement Learning on Point Clouds},\nauthor={Balazs Gyenes and Nikolai Franke and Philipp Becker and Gerhard Neumann},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3jNEz3kUSl}\n}", "github": "https://github.com/balazsgyenes/pprl", "project": "", "reviewers": "eyuT;HVuz;Gb3L", "site": "https://openreview.net/forum?id=3jNEz3kUSl", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LcrvUitlzLUJ:scholar.google.com/&scioq=PointPatchRL+-+Masked+Reconstruction+Improves+Reinforcement+Learning+on+Point+Clouds&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;FZI Forschungszentrum Informatik;Karlsruhe Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kit.edu;https://www.fzi.de;https://www.kit.edu", "aff_unique_abbr": "KIT;FZI;KIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "3wBqoPfoeJ", "title": "Twisting Lids Off with Two Hands", "track": "main", "status": "Poster", "tldr": "", "abstract": 
"Manipulating objects with two multi-fingered hands has been a long-standing challenge in robotics, due to the contact-rich nature of many manipulation tasks and the complexity inherent in coordinating a high-dimensional bimanual system. In this work, we share novel insights into physical modeling, real-time perception, and reward design that enable policies trained in simulation using deep reinforcement learning (RL) to be effectively and efficiently transferred to the real world. Specifically, we consider the problem of twisting lids of various bottle-like objects with two hands, demonstrating policies with generalization capabilities across a diverse set of unseen objects as well as dynamic and dexterous behaviors. To the best of our knowledge, this is the first sim-to-real RL system that enables such capabilities on bimanual multi-fingered hands.", "keywords": "Bimanual Manipulation;Sim-to-Real;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/545221b5d6ea058b57c6ccbdbead43544cbebe1a.zip", "author": "Toru Lin;Zhao-Heng Yin;Haozhi Qi;Pieter Abbeel;Jitendra Malik", "authorids": "~Toru_Lin1;~Zhao-Heng_Yin1;~Haozhi_Qi1;~Pieter_Abbeel2;~Jitendra_Malik2", "gender": ";;M;M;M", "homepage": ";;https://haozhi.io/;https://people.eecs.berkeley.edu/~pabbeel/;https://people.eecs.berkeley.edu/~malik/", "dblp": ";;190/7802;;58/2944", "google_scholar": ";;https://scholar.google.com.hk/citations?user=iyVHKkcAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;oY9R5YQAAAAJ", "orcid": ";;;;0000-0003-3695-1580", "linkedin": ";;;;", "or_profile": "~Toru_Lin1;~Zhao-Heng_Yin1;~Haozhi_Qi1;~Pieter_Abbeel2;~Jitendra_Malik2", "aff": ";;University of California, Berkeley;Covariant;University of California, Berkeley", "aff_domain": ";;berkeley.edu;covariant.ai;berkeley.edu", "position": ";;PhD student;Founder;Full Professor", "bibtex": "@inproceedings{\nlin2024twisting,\ntitle={Twisting Lids Off with Two Hands},\nauthor={Toru Lin and Zhao-Heng Yin and Haozhi Qi and Pieter Abbeel and Jitendra Malik},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=3wBqoPfoeJ}\n}", "github": "", "project": "", "reviewers": "qU69;DcnM;qvJe", "site": "https://openreview.net/forum?id=3wBqoPfoeJ", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15516888825205626426&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "University of California, Berkeley;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States;" }, { "id": "46SluHKoE9", "title": "Continuously Improving Mobile Manipulation with Autonomous Real-World RL", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present a fully autonomous real-world RL framework for mobile manipulation that can learn policies without extensive instrumentation or human supervision. 
This is enabled by 1) task-relevant autonomy, which guides exploration towards object interactions and prevents stagnation near goal states, 2) efficient policy learning by leveraging basic task knowledge in behavior priors, and 3) formulating generic rewards that combine human-interpretable semantic information with low-level, fine-grained observations. We demonstrate that our approach allows Spot robots to continually improve their performance on a set of four challenging mobile manipulation tasks, obtaining an average success rate of 80% across tasks, a 3-4 times improvement over existing approaches. Videos can be found at https://continual-mobile-manip.github.io/.", "keywords": "Continual Learning;Mobile Manipulation;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/2db38218795e642c7d6d39741976f20a0edaf37a.zip", "author": "Russell Mendonca;Emmanuel Panov;Bernadette Bucher;Jiuguang Wang;Deepak Pathak", "authorids": "~Russell_Mendonca1;epanov@theaiinstitute.com;~Bernadette_Bucher1;~Jiuguang_Wang1;~Deepak_Pathak1", "gender": "M;;F;;M", "homepage": "https://russellmendonca.github.io/;;http://bernadettekbucher.com;;https://www.cs.cmu.edu/~dpathak/", "dblp": "215/5062;;251/5461;;155/9860", "google_scholar": "Uly5spMAAAAJ;;VIZvaGsAAAAJ;;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;;;", "linkedin": ";;bernadette-bucher-09898536/;;pathak22/", "or_profile": "~Russell_Mendonca1;epanov@theaiinstitute.com;~Bernadette_Bucher1;~Jiuguang_Wang1;~Deepak_Pathak1", "aff": "Carnegie Mellon University;;Boston Dynamics AI Institute;;Carnegie Mellon University", "aff_domain": "cmu.edu;;theaiinstitute.com;;cmu.edu", "position": "PhD student;;Researcher;;Assistant Professor", "bibtex": "@inproceedings{\nmendonca2024continuously,\ntitle={Continuously Improving Mobile Manipulation with Autonomous Real-World {RL}},\nauthor={Russell Mendonca and Emmanuel Panov and Bernadette Bucher and Jiuguang Wang and Deepak Pathak},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=46SluHKoE9}\n}", "github": "", "project": "", "reviewers": "qcD5;of2F;N2Ar;zbZw", "site": "https://openreview.net/forum?id=46SluHKoE9", "pdf_size": 0, "rating": "3;3;3;3", "confidence": "4;3;4;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17386348096473122290&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Boston Dynamics AI Institute", "aff_unique_dep": ";AI Institute", "aff_unique_url": "https://www.cmu.edu;https://www.bostondynamics.com/", "aff_unique_abbr": "CMU;BD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "4Of4UWyBXE", "title": "RP1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual Dexterous Robot Hands", "track": "main", "status": "Poster", "tldr": "", "abstract": "Endowing robot hands with human-level dexterity is a long-lasting research objective. Bi-manual robot piano playing constitutes a task that combines challenges from dynamic tasks, such as generating fast while precise motions, with slower but contact-rich manipulation problems. Although reinforcement learning based approaches have shown promising results in single-task performance, these methods struggle in a multi-song setting. 
Our work aims to close this gap and, thereby, enable imitation learning approaches for robot piano playing at scale. To this end, we introduce the Robot Piano 1 Million (RP1M) dataset, containing bi-manual robot piano playing motion data of more than one million trajectories. We formulate finger placements as an optimal transport problem, thus, enabling automatic annotation of vast amounts of unlabeled songs. Benchmarking existing imitation learning approaches shows that such approaches reach state-of-the-art robot piano playing performance by leveraging RP1M.", "keywords": "Bi-manual dexterous robot hands;dataset for robot piano playing;imitation learning;robot learning at scale", "primary_area": "", "supplementary_material": "/attachment/e5b5cd7f0fc1a21be0d7d008c2eb99558b951dc6.zip", "author": "Yi Zhao;Le Chen;Jan Schneider;Quankai Gao;Juho Kannala;Bernhard Sch\u00f6lkopf;Joni Pajarinen;Dieter B\u00fcchler", "authorids": "~Yi_Zhao6;~Le_Chen3;~Jan_Schneider1;~Quankai_Gao1;~Juho_Kannala5;~Bernhard_Sch\u00f6lkopf1;~Joni_Pajarinen2;~Dieter_B\u00fcchler1", "gender": "M;;M;M;M;;;M", "homepage": "https://zhaoyi11.github.io/;;https://ei.is.mpg.de/person/jschneider;https://github.com/Zerg-Overmind;https://users.aalto.fi/~kannalj1/;;;http://embodied.ml/", "dblp": "51/4138-1;;;287/5063;47/4656.html;;23/8355;181/4076.html", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=de;;c4mWQPQAAAAJ;;https://scholar.google.fi/citations?user=-2fJStwAAAAJ;https://scholar.google.de/citations?user=8HYQ1tgAAAAJ", "orcid": "0009-0002-9979-595X;;0000-0001-8426-7981;;0000-0001-5088-4041;;0000-0003-4469-8191;", "linkedin": ";;;;;;;", "or_profile": "~Yi_Zhao6;~Le_Chen3;~Jan_Schneider1;~Quankai_Gao1;~Juho_Kannala5;~Bernhard_Sch\u00f6lkopf1;~Joni_Pajarinen2;~Dieter_B\u00fcchler1", "aff": "Max Planck Institute for Intelligent Systems;;Max Planck Institute for Intelligent Systems;University of Southern California;Aalto University;;Aalto University;Max Planck Institute for Intelligent Systems, Max-Planck Institute", "aff_domain": "mpg.tuebingen.de;;is.mpg.de;usc.edu;aalto.fi;;aalto.fi;tuebingen.mpg.de", "position": "Intern;;PhD student;PhD student;Associate Professor;;Assistant Professor;Group Leader", "bibtex": "@inproceedings{\nzhao2024rpm,\ntitle={{RP}1M: A Large-Scale Motion Dataset for Piano Playing with Bi-Manual Dexterous Robot Hands},\nauthor={Yi Zhao and Le Chen and Jan Schneider and Quankai Gao and Juho Kannala and Bernhard Sch{\\\"o}lkopf and Joni Pajarinen and Dieter B{\\\"u}chler},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=4Of4UWyBXE}\n}", "github": "", "project": "", "reviewers": "Kzd5;Sw5X;EzFs;3sJW;qnEm;cZsS;iGED", "site": "https://openreview.net/forum?id=4Of4UWyBXE", "pdf_size": 0, "rating": "3;3;3;3;3;3;3", "confidence": "5;3;4;4;4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11771391503354635670&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;2;0", "aff_unique_norm": "Max Planck Institute for Intelligent Systems;University of Southern California;Aalto University", "aff_unique_dep": "Intelligent Systems;;", "aff_unique_url": "https://www.mpi-is.mpg.de;https://www.usc.edu;https://www.aalto.fi", "aff_unique_abbr": "MPI-IS;USC;Aalto", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", 
"aff_country_unique_index": "0;0;1;2;2;0", "aff_country_unique": "Germany;United States;Finland" }, { "id": "55tYfHvanf", "title": "Bimanual Dexterity for Complex Tasks", "track": "main", "status": "Poster", "tldr": "", "abstract": "To train generalist robot policies, machine learning methods often require a substantial amount of expert human teleoperation data. An ideal robot for humans collecting data is one that closely mimics them: bimanual arms and dexterous hands. However, creating such a bimanual teleoperation system with over 50 DoF is a significant challenge. To address this, we introduce Bidex, an extremely dexterous, low-cost, low-latency and portable bimanual dexterous teleoperation system which relies on motion capture gloves and teacher arms. We compare Bidex to a Vision Pro teleoperation system and a SteamVR system and find Bidex to produce better quality data for more complex tasks at a faster rate. Additionally, we show Bidex operating a mobile bimanual robot for in the wild tasks. Please refer to https://bidex-teleop.github.io for video results and instructions to recreate Bidex. The robot hands (5k USD) and teleoperation system (7k USD) is readily reproducible and can be used on many robot arms including two xArms ($16k USD).", "keywords": "Dexterous Manipulation;Bimanual;Behavior Cloning", "primary_area": "", "supplementary_material": "/attachment/af087a100a66586612357fa1354351355862b729.zip", "author": "Kenneth Shaw;Yulong Li;Jiahui Yang;Mohan Kumar Srirama;Ray Liu;Haoyu Xiong;Russell Mendonca;Deepak Pathak", "authorids": "~Kenneth_Shaw1;~Yulong_Li1;jiahuiya@andrew.cmu.edu;~Mohan_Kumar_Srirama1;muxinl@andrew.cmu.edu;~Haoyu_Xiong3;~Russell_Mendonca1;~Deepak_Pathak1", "gender": "M;M;;M;;M;M;M", "homepage": "https://www.linkedin.com/in/kenny-shaw/;https://yulongli42.github.io;;https://www.mohansrirama.com;;https://haoyu-x.github.io/;https://russellmendonca.github.io/;https://www.cs.cmu.edu/~dpathak/", "dblp": ";;;;;;215/5062;155/9860", "google_scholar": ";;;;;;Uly5spMAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ", "orcid": ";;;;;;;", "linkedin": "kenny-shaw/;;;;;;;pathak22/", "or_profile": "~Kenneth_Shaw1;~Yulong_Li1;jiahuiya@andrew.cmu.edu;~Mohan_Kumar_Srirama1;muxinl@andrew.cmu.edu;~Haoyu_Xiong3;~Russell_Mendonca1;~Deepak_Pathak1", "aff": "Carnegie Mellon University;Carnegie Mellon University;;Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;;cmu.edu;;andrew.cmu.edu;cmu.edu;cmu.edu", "position": "PhD student;MS student;;Researcher;;MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nshaw2024bimanual,\ntitle={Bimanual Dexterity for Complex Tasks},\nauthor={Kenneth Shaw and Yulong Li and Jiahui Yang and Mohan Kumar Srirama and Ray Liu and Haoyu Xiong and Russell Mendonca and Deepak Pathak},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=55tYfHvanf}\n}", "github": "", "project": "", "reviewers": "kK4n;vkX4;ew2f", "site": "https://openreview.net/forum?id=55tYfHvanf", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;3;2", "rating_avg": 3.0, "confidence_avg": 2.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5317027727581613213&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Carnegie Mellon 
University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "56IzghzjfZ", "title": "IMAGINATION POLICY: Using Generative Point Cloud Models for Learning Manipulation Policies", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humans can imagine goal states during planning and perform actions to match those goals. In this work, we propose IMAGINATION POLICY, a novel multi-task key-frame policy network for solving high-precision pick and place tasks. Instead of learning actions directly, IMAGINATION POLICY generates point clouds to imagine desired states which are then translated to actions using rigid action estimation. This transforms action inference into a local generative task. We leverage pick and place symmetries underlying the tasks in the generation process and achieve extremely high sample efficiency and generalizability to unseen configurations. Finally, we demonstrate state-of-the-art performance across various tasks on the RLbench benchmark compared with several strong baselines and validate our approach on a real robot.", "keywords": "Manipulation policy learning;Generative model;Geometric learning", "primary_area": "", "supplementary_material": "/attachment/dcc041c26760108bf00fa06b749b7505de3362cc.zip", "author": "Haojie Huang;Karl Schmeckpeper;Dian Wang;Ondrej Biza;Yaoyao Qian;Haotian Liu;Mingxi Jia;Robert Platt;Robin Walters", "authorids": "~Haojie_Huang1;~Karl_Schmeckpeper1;~Dian_Wang1;~Ondrej_Biza1;~Yaoyao_Qian1;~Haotian_Liu6;~Mingxi_Jia1;~Robert_Platt1;~Robin_Walters1", "gender": "M;;M;M;F;M;M;;M", "homepage": "https://haojhuang.github.io/;https://sites.google.com/view/karlschmeckpeper/;https://pointw.github.io/;https://sites.google.com/view/obiza;https://h-freax.github.io;https://andyliu7081.github.io/;https://saulbatman.github.io/;http://www.ccs.neu.edu/home/rplatt/;http://www.robinwalters.com", "dblp": "144/2195;245/5630;191/1369-1;230/8616.html;380/7236;;315/4688;39/5434;258/3416", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;E2kpqtkAAAAJ;CckjtfQAAAAJ;Gi9Xq8YAAAAJ;E0rCXLIAAAAJ;kHsMnEYAAAAJ;1iNSPQIAAAAJ;Z4Y5S2oAAAAJ;fnprJmUAAAAJ", "orcid": ";0000-0003-4989-2022;;0000-0003-3390-8050;;0009-0006-5770-6030;;;", "linkedin": ";;dianwang1007;ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;;;https://www.linkedin.com/mwlite/in/mingxi-jia-6997b9183;;", "or_profile": "~Haojie_Huang1;~Karl_Schmeckpeper1;~Dian_Wang1;~Ondrej_Biza1;~Yaoyao_Qian1;~Haotian_Liu6;~Mingxi_Jia1;~Robert_Platt1;~Robin_Walters1", "aff": "Northeastern University;The Robotics and AI Institute;Northeastern University;Northeastern University;Northeastern University;Northeastern University;Brown University;Northeastern University;Northeastern University ", "aff_domain": "northeastern.edu;theaiinstitute.com;northeastern.edu;northeastern.edu;neu.edu;northeastern.edu;brown.edu;neu.edu;northeastern.edu", "position": "PhD student;Researcher;PhD student;PhD student;MS student;Intern;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024imagination,\ntitle={{IMAGINATION} {POLICY}: Using Generative Point Cloud Models for Learning Manipulation Policies},\nauthor={Haojie Huang and Karl Schmeckpeper and Dian Wang and Ondrej Biza and Yaoyao Qian and Haotian Liu and Mingxi Jia and Robert Platt and Robin Walters},\nbooktitle={8th Annual Conference on Robot 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=56IzghzjfZ}\n}", "github": "", "project": "", "reviewers": "1GT1;1BKy;U3RP", "site": "https://openreview.net/forum?id=56IzghzjfZ", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=739399138957187396&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;0;0;0;0;2;0;0", "aff_unique_norm": "Northeastern University;Robotics and AI Institute;Brown University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.northeastern.edu;;https://www.brown.edu", "aff_unique_abbr": "NEU;;Brown", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "5Awumz1VKU", "title": "Learning Differentiable Tensegrity Dynamics using Graph Neural Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Tensegrity robots are composed of rigid struts and flexible cables. They constitute an emerging class of hybrid rigid-soft robotic systems and are promising systems for a wide array of applications, ranging from locomotion to assembly. They are difficult to control and model accurately, however, due to their compliance and high number of degrees of freedom. To address this issue, prior work has introduced a differentiable physics engine designed for tensegrity robots based on first principles. In contrast, this work proposes the use of graph neural networks to model contact dynamics over a graph representation of tensegrity robots, which leverages their natural graph-like cable connec- tivity between end caps of rigid rods. This learned simulator can accurately model 3-bar and 6-bar tensegrity robot dynamics in simulation-to-simulation experiments where MuJoCo is used as the ground truth. It can also achieve higher accuracy than the previous differentiable engine for a real 3-bar tensegrity robot, for which the robot state is only partially observable. When compared against direct applications of recent mesh-based graph neural network simulators, the proposed approach is computationally more efficient, both for training and inference, while achieving higher accuracy. Code and data are available at https://github.com/nchen9191/tensegrity_gnn_simulator_public", "keywords": "graph neural networks;differentiable simulation;tensegrity robots", "primary_area": "", "supplementary_material": "/attachment/4ad817067b11ab08286dc60d31e33afe4a1c10a5.zip", "author": "Nelson Chen;Kun Wang;William R. 
Johson III;Rebecca Kramer-Bottiglio;Kostas Bekris;Mridul Aanjaneya", "authorids": "~Nelson_Chen1;~Kun_Wang10;will.johnson@yale.edu;~Rebecca_Kramer-Bottiglio1;~Kostas_Bekris1;~Mridul_Aanjaneya3", "gender": "M;M;;;M;M", "homepage": ";;;https://www.eng.yale.edu/faboratory/;https://pracsys.cs.rutgers.edu/members/kostas-bekris/;https://orionquest.github.io/", "dblp": ";;;;42/170;81/5359.html", "google_scholar": ";83KxgOYAAAAJ;;;https://scholar.google.com.tw/citations?user=gwC7rCUAAAAJ;ZyaSau8AAAAJ", "orcid": ";;;;;0000-0002-5286-8173", "linkedin": "nchen9191/;;;;kostas-bekris-0b56794/;", "or_profile": "~Nelson_Chen1;~Kun_Wang10;will.johnson@yale.edu;~Rebecca_Kramer-Bottiglio1;~Kostas_Bekris1;~Mridul_Aanjaneya3", "aff": "Rutgers University;;;Yale University;Rutgers University;Rutgers University, New Brunswick", "aff_domain": "cs.rutgers.edu;;;yale.edu;rutgers.edu;rutgers.edu", "position": "PhD student;;;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024learning,\ntitle={Learning Differentiable Tensegrity Dynamics using Graph Neural Networks},\nauthor={Nelson Chen and Kun Wang and William R. Johson III and Rebecca Kramer-Bottiglio and Kostas Bekris and Mridul Aanjaneya},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5Awumz1VKU}\n}", "github": "https://github.com/nchen9191/tensegrity_gnn_simulator_public", "project": "", "reviewers": "uba8;mw3p;z5sV;Luga", "site": "https://openreview.net/forum?id=5Awumz1VKU", "pdf_size": 0, "rating": "3;3;3;3", "confidence": "4;4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:uf1zT5xXLbUJ:scholar.google.com/&scioq=Learning+Differentiable+Tensegrity+Dynamics+using+Graph+Neural+Networks&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Rutgers University;Yale University", "aff_unique_dep": ";", "aff_unique_url": "https://www.rutgers.edu;https://www.yale.edu", "aff_unique_abbr": "Rutgers;Yale", "aff_campus_unique_index": "1", "aff_campus_unique": ";New Brunswick", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "5W0iZR9J7h", "title": "DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale Synthetic Cluttered Scenes", "track": "main", "status": "Poster", "tldr": "", "abstract": "Grasping in cluttered scenes remains highly challenging for dexterous hands due to the scarcity of data. To address this problem, we present a large-scale synthetic dataset, encompassing 1319 objects, 8270 scenes, and 426 million grasps. Beyond benchmarking, we also explore data-efficient learning strategies from grasping data. We reveal that the combination of a conditional generative model that focuses on local geometry and a grasp dataset that emphasizes complex scene variations is key to achieving effective generalization. Our proposed generative method outperforms all baselines in simulation experiments. 
Furthermore, it demonstrates zero-shot sim-to-real transfer through test-time depth restoration, attaining 91% real-world success rate, showcasing the robust potential of utilizing fully synthetic training data.", "keywords": "Dexterous Grasping;Synthetic Data;Generative Models", "primary_area": "", "supplementary_material": "/attachment/c407edfba237de0891b457c270efdc906d33d610.zip", "author": "Jialiang Zhang;Haoran Liu;Danshi Li;XinQiang Yu;Haoran Geng;Yufei Ding;Jiayi Chen;He Wang", "authorids": "~Jialiang_Zhang2;~Haoran_Liu4;~Danshi_Li1;~XinQiang_Yu1;~Haoran_Geng1;~Yufei_Ding4;~Jiayi_Chen5;~He_Wang5", "gender": "M;;M;M;M;F;M;M", "homepage": "https://github.com/mzhmxzh;https://github.com/lhrrhl0419;https://danshi-li.github.io/personal-website/;;https://github.com/geng-haoran/haorangeng;https://selina2023.github.io/;https://hughw19.github.io;https://github.com/JYChen18", "dblp": ";;358/9014.html;;295/7112;;01/6368-10;42/1159-3", "google_scholar": ";;KJArKlgAAAAJ;;Inr-6rEAAAAJ;dGFdGMAAAAAJ;roCAWkoAAAAJ;", "orcid": ";;;0000-0001-8440-8975;;0009-0005-6219-2989;;", "linkedin": ";;;;haoran-geng-422778238/;;;", "or_profile": "~Jialiang_Zhang2;~Haoran_Liu4;~Danshi_Li1;~XinQiang_Yu1;~Haoran_Geng1;~Yufei_Ding4;~He_Wang5;~jiayi_chen3", "aff": "Peking University;Peking University;New York University;Institute of Computing Technology, Chinese Academy of Sciences;Peking University;Peking University;Peking University;Peking University", "aff_domain": "pku.edu.cn;pku.edu.cn;nyu.edu;ict.ac.cn;pku.edu.cn;stu.pku.edu.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Undergrad student;MS student;MS student;Undergrad student;Undergrad student;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nzhang2024dexgraspnet,\ntitle={DexGraspNet 2.0: Learning Generative Dexterous Grasping in Large-scale Synthetic Cluttered Scenes},\nauthor={Jialiang Zhang and Haoran Liu and Danshi Li and XinQiang Yu and Haoran Geng and Yufei Ding and Jiayi Chen and He Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5W0iZR9J7h}\n}", "github": "", "project": "", "reviewers": "2mB6;bSuk;AfH6", "site": "https://openreview.net/forum?id=5W0iZR9J7h", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17666609232437239068&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0;0;0;0", "aff_unique_norm": "Peking University;New York University;Chinese Academy of Sciences", "aff_unique_dep": ";;Institute of Computing Technology", "aff_unique_url": "http://www.pku.edu.cn;https://www.nyu.edu;http://www.ict.ac.cn", "aff_unique_abbr": "Peking U;NYU;CAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0;0;0", "aff_country_unique": "China;United States" }, { "id": "5iXG6EgByK", "title": "Promptable Closed-loop Traffic Simulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Simulation stands as a cornerstone for safe and efficient autonomous driving development. At its core a simulation system ought to produce realistic, reactive, and controllable traffic patterns. In this paper, we propose ProSim, a multimodal promptable closed-loop traffic simulation framework. 
ProSim allows the user to give a complex set of numerical, categorical or textual prompts to instruct each agent\u2019s behavior and intention. ProSim then rolls out a traffic scenario in a closed-loop manner, modeling each agent\u2019s interaction with other traffic participants. Our experiments show that ProSim achieves high prompt controllability given different user prompts, while reaching competitive performance on the Waymo Sim Agents Challenge when no prompt is given. To support research on promptable traffic simulation, we create ProSim-Instruct-520k, a multimodal prompt-scenario paired driving dataset with over 10M text prompts for over 520k real-world driving scenarios. We will release data, benchmark, and labeling tools of ProSim-Instruct-520k upon publication.", "keywords": "Autonomous Driving;Scenario Generation;Traffic Simulation", "primary_area": "", "supplementary_material": "/attachment/421bf5a701bcf78e3f3886c30653cca835dc6011.zip", "author": "Shuhan Tan;Boris Ivanovic;Yuxiao Chen;Boyi Li;Xinshuo Weng;Yulong Cao;Philipp Kraehenbuehl;Marco Pavone", "authorids": "~Shuhan_Tan2;~Boris_Ivanovic1;~Yuxiao_Chen3;~Boyi_Li1;~Xinshuo_Weng3;~Yulong_Cao1;~Philipp_Kraehenbuehl1;~Marco_Pavone1", "gender": "M;;F;F;;M;M;M", "homepage": "https://ariostgx.github.io/website/;http://www.borisivanovic.com/;https://sites.google.com/site/boyilics/home;http://www.xinshuoweng.com;https://kikacaty.github.io/;http://www.philkr.net/;https://web.stanford.edu/~pavone/;https://research.nvidia.com/person/yuxiao-chen", "dblp": ";203/8356;;192/1952.html;207/6576;43/7592;91/3382-1.html;158/4934-1", "google_scholar": "Ro6enEEAAAAJ;ey9AQcEAAAAJ;;dthSEsoAAAAJ;uclqBzgAAAAJ;https://scholar.google.com.tw/citations?user=dzOd2hgAAAAJ;RhOpyXcAAAAJ;AOdxmJYAAAAJ", "orcid": ";0000-0002-8698-202X;;0000-0002-7894-4381;;;;0000-0001-5276-7156", "linkedin": ";boris-ivanovic-a3103064;;xinshuoweng;;;;", "or_profile": "~Shuhan_Tan2;~Boris_Ivanovic1;~Boyi_Li1;~Xinshuo_Weng3;~Yulong_Cao1;~Philipp_Kraehenbuehl1;~Marco_Pavone1;~Yuxiao_Chen2", "aff": "NVIDIA;NVIDIA;University of California, Berkeley;NVIDIA;NVIDIA;Apple;Stanford University;California Institute of Technology", "aff_domain": "nvidia.com;nvidia.com;berkeley.edu;nvidia.com;nvidia.com;apple.com;stanford.edu;caltech.edu", "position": "Research Intern;Researcher;Postdoc;Researcher;Researcher;Researcher;Associate Professor;Postdoc", "bibtex": "@inproceedings{\ntan2024promptable,\ntitle={Promptable Closed-loop Traffic Simulation},\nauthor={Shuhan Tan and Boris Ivanovic and Yuxiao Chen and Boyi Li and Xinshuo Weng and Yulong Cao and Philipp Kraehenbuehl and Marco Pavone},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5iXG6EgByK}\n}", "github": "", "project": "", "reviewers": "Vo5Q;w2z7;M9no", "site": "https://openreview.net/forum?id=5iXG6EgByK", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3609758759978015035&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;0;2;3;4", "aff_unique_norm": "NVIDIA;University of California, Berkeley;Apple;Stanford University;California Institute of Technology", "aff_unique_dep": "NVIDIA Corporation;;Apple Inc.;;", "aff_unique_url": "https://www.nvidia.com;https://www.berkeley.edu;https://www.apple.com;https://www.stanford.edu;https://www.caltech.edu", 
"aff_unique_abbr": "NVIDIA;UC Berkeley;Apple;Stanford;Caltech", "aff_campus_unique_index": "1;2;3", "aff_campus_unique": ";Berkeley;Stanford;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "5lSkn5v4LK", "title": "EquiGraspFlow: SE(3)-Equivariant 6-DoF Grasp Pose Generative Flows", "track": "main", "status": "Poster", "tldr": "", "abstract": "Traditional methods for synthesizing 6-DoF grasp poses from 3D observations often rely on geometric heuristics, resulting in poor generalizability, limited grasp options, and higher failure rates. Recently, data-driven methods have been proposed that use generative models to learn the distribution of grasp poses and generate diverse candidate poses. The main drawback of these methods is that they fail to achieve SE(3)-equivariance, meaning that the generated grasp poses do not transform correctly with object rotations and translations. In this paper, we propose \\textit{EquiGraspFlow}, a flow-based SE(3)-equivariant 6-DoF grasp pose generative model that can learn complex conditional distributions on the SE(3) manifold while guaranteeing SE(3)-equivariance. Our model achieves the equivariance without relying on data augmentation, by using network architectures that guarantee the equivariance by construction. Extensive experiments show that \\textit{EquiGraspFlow} accurately learns grasp pose distribution, achieves the SE(3)-equivariance, and significantly outperforms existing grasp pose generative models. Code is available at https://github.com/bdlim99/EquiGraspFlow.", "keywords": "6-DoF grasp pose generation;equivariance;generative models;continuous normalizing flows", "primary_area": "", "supplementary_material": "/attachment/9cdfab291b2cb9eafc55863f27dd67c9c91f772a.zip", "author": "Byeongdo Lim;Jongmin Kim;Jihwan Kim;Yonghyeon Lee;Frank C. Park", "authorids": "~Byeongdo_Lim1;~Jongmin_Kim7;~Jihwan_Kim2;~Yonghyeon_Lee2;~Frank_C._Park1", "gender": ";M;M;M;M", "homepage": "http://robot.snu.ac.kr;https://sites.google.com/robotics.snu.ac.kr/fcp/;http://robot.snu.ac.kr/;https://www.gabe-yhlee.com;http://robotics.snu.ac.kr", "dblp": "341/9568;;;182/6796;p/FrankChongwooPark", "google_scholar": ";;;;u-h3PJIAAAAJ", "orcid": ";;;;0000-0002-0293-6975", "linkedin": ";;;;", "or_profile": "~Byeongdo_Lim1;~Jongmin_Kim7;~Jihwan_Kim2;~Yonghyeon_Lee2;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;Seoul National University;Korea Institute for Advanced Study;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;snu.ac.kr;kias.re.kr;snu.ac.kr", "position": "PhD student;MS student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nlim2024equigraspflow,\ntitle={EquiGraspFlow: {SE}(3)-Equivariant 6-DoF Grasp Pose Generative Flows},\nauthor={Byeongdo Lim and Jongmin Kim and Jihwan Kim and Yonghyeon Lee and Frank C. 
Park},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5lSkn5v4LK}\n}", "github": "https://github.com/bdlim99/EquiGraspFlow", "project": "", "reviewers": "2vdQ;Qvx1;jkBE", "site": "https://openreview.net/forum?id=5lSkn5v4LK", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16865101928773692712&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Seoul National University;Korea Institute for Advanced Study", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;http://www.kaist.edu", "aff_unique_abbr": "SNU;KIAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "5u9l6U61S7", "title": "GenSim2: Scaling Robot Data Generation with Multi-modal and Reasoning LLMs", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robotic simulation today remains challenging to scale up due to the human efforts required to create diverse simulation tasks and scenes. Simulation-trained policies also face scalability issues as many sim-to-real methods focus on a single task. To address these challenges, this work proposes GenSim2, a scalable framework that leverages coding LLMs with multi-modal and reasoning capabilities for complex and realistic simulation task creation, including long-horizon tasks with articulated objects. To automatically generate demonstration data for these tasks at scale, we propose planning and RL solvers that generalize within object categories. The pipeline can generate data for up to 100 articulated tasks with 200 objects and reduce the required human efforts. To utilize such data, we propose an effective multi-task language-conditioned policy architecture, dubbed proprioceptive point-cloud transformer (PPT), that learns from the generated demonstrations and exhibits strong sim-to-real zero-shot transfer. 
Combining the proposed pipeline and the policy architecture, we show a promising usage of GenSim2 that the generated data can be used for zero-shot transfer or co-train with real-world collected data, which enhances the policy performance by 20% compared with training exclusively on limited real data.", "keywords": "Generative Simulation; Robotics; Learning", "primary_area": "", "supplementary_material": "/attachment/a1a1de0aa3d62bc93c8b1b79831b04688abfce03.zip", "author": "Pu Hua;Minghuan Liu;Annabella Macaluso;Yunfeng Lin;Weinan Zhang;Huazhe Xu;Lirui Wang", "authorids": "~Pu_Hua1;~Minghuan_Liu1;~Annabella_Macaluso1;~Yunfeng_Lin1;~Weinan_Zhang1;~Huazhe_Xu1;~Lirui_Wang1", "gender": "M;M;F;M;M;M;M", "homepage": "https://piao-0429.github.io/;http://minghuanliu.com;https://annabellamacaluso.github.io/;https://github.com/creeperlin;http://wnzhang.net;http://hxu.rocks;https://liruiw.github.io/", "dblp": "331/5335;249/7554;;;28/10261-1;164/9006;221/9612", "google_scholar": "https://scholar.google.com/citations?hl=en;;yqqESloAAAAJ;;Qzss0GEAAAAJ;t9HPFawAAAAJ;EM9YhH0AAAAJ", "orcid": "0009-0008-1301-7131;;;;0000-0002-0127-2425;;", "linkedin": "https://www.linkedin.cn/incareer/in/pu-hua-315462215;;annabella-macaluso-93a023192/;;;;", "or_profile": "~Pu_Hua1;~Minghuan_Liu1;~Annabella_Macaluso1;~Yunfeng_Lin1;~Weinan_Zhang1;~Huazhe_Xu1;~Lirui_Wang1", "aff": "Electronic Engineering, Tsinghua University;Shanghai Jiaotong University;University of California, San Diego;;Shanghai Jiaotong University;Tsinghua University;Massachusetts Institute of Technology", "aff_domain": "mails.tsinghua.edu.cn;sjtu.edu.cn;ucsd.edu;;sjtu.edu.cn;tsinghua.edu.cn;mit.edu", "position": "Undergrad student;PhD student;MS student;;Associate Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nhua2024gensim,\ntitle={GenSim2: Scaling Robot Data Generation with Multi-modal and Reasoning {LLM}s},\nauthor={Pu Hua and Minghuan Liu and Annabella Macaluso and Yunfeng Lin and Weinan Zhang and Huazhe Xu and Lirui Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=5u9l6U61S7}\n}", "github": "https://github.com/GenSim2/GenSim2", "project": "", "reviewers": "9Mkg;Wb5Y;i12f", "site": "https://openreview.net/forum?id=5u9l6U61S7", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": -1.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10207369442489133621&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1;0;3", "aff_unique_norm": "Tsinghua University;Shanghai Jiao Tong University;University of California, San Diego;Massachusetts Institute of Technology", "aff_unique_dep": "Electronic Engineering;;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sjtu.edu.cn;https://www.ucsd.edu;https://web.mit.edu", "aff_unique_abbr": "THU;SJTU;UCSD;MIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;1;0;0;1", "aff_country_unique": "China;United States" }, { "id": "67tTQeO4HQ", "title": "In-Flight Attitude Control of a Quadruped using Deep Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present the development and real world demonstration of an in-flight attitude control law for a small low-cost quadruped with a five-bar-linkage leg design using only its legs as reaction masses. 
The control law is trained using deep reinforcement learning (DRL) and specifically through Proximal Policy Optimization (PPO) in the NVIDIA Omniverse Isaac Sim simulator with a GPU-accelerated DRL pipeline. To demonstrate the policy, a small quadruped is designed, constructed, and evaluated both on a rotating pole test setup and in free fall. During a free fall of 0.7 seconds, the quadruped follows commanded attitude steps of 45 degrees in all principal axes, and achieves an average base angular velocity of 110 degrees per second during large attitude reference steps.", "keywords": "Deep Reinforcement Learning;Legged Robotics", "primary_area": "", "supplementary_material": "/attachment/f126c9ddcbbd1c51ad34e3ffa8c7ffb616d19de1.zip", "author": "Tarek El-Agroudi;Finn Gross Maurer;J\u00f8rgen Anker Olsen;Kostas Alexis", "authorids": "~Tarek_El-Agroudi2;~Finn_Gross_Maurer1;~J\u00f8rgen_Anker_Olsen1;~Kostas_Alexis1", "gender": "M;M;M;", "homepage": ";;;https://www.autonomousrobotslab.com/", "dblp": ";;;", "google_scholar": ";;;1rPSVNAAAAAJ", "orcid": ";;;0000-0002-9989-298X", "linkedin": "tarek-el-agroudi/;finn-maurer-5550ba156/;jorgenankerolsen/;kostas-alexis-67713918/", "or_profile": "~Tarek_El-Agroudi2;~Finn_Gross_Maurer1;~J\u00f8rgen_Anker_Olsen1;~Kostas_Alexis1", "aff": "Norwegian University of Science and Technology;Norwegian University of Science and Technology;Norwegian University of Science and Technology;Norwegian University of Science and Technology", "aff_domain": "ntnu.no;ntnu.no;ntnu.no;ntnu.no", "position": "MS student;MS student;PhD student;Full Professor", "bibtex": "@inproceedings{\nel-agroudi2024inflight,\ntitle={In-Flight Attitude Control of a Quadruped using Deep Reinforcement Learning},\nauthor={Tarek El-Agroudi and Finn Gross Maurer and J{\\o}rgen Anker Olsen and Kostas Alexis},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=67tTQeO4HQ}\n}", "github": "https://github.com/ntnu-arl/Eurepus-RL and https://github.com/ntnu-arl/Eurepus-design", "project": "", "reviewers": "dUvT;82XZ;LWaG", "site": "https://openreview.net/forum?id=67tTQeO4HQ", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14189661490880278072&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Norwegian University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ntnu.no", "aff_unique_abbr": "NTNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Norway" }, { "id": "6FGlpzC9Po", "title": "Steering Your Generalists: Improving Robotic Foundation Models via Value Guidance", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large, general-purpose robotic policies trained on diverse demonstration datasets have been shown to be remarkably effective both for controlling a variety of robots in a range of different scenes, and for acquiring broad repertoires of manipulation skills. However, the data that such policies are trained on is generally of mixed quality -- not only are human-collected demonstrations unlikely to perform the task perfectly, but the larger the dataset is, the harder it is to curate only the highest quality examples. 
It also remains unclear how optimal data from one embodiment is for training on another embodiment. In this paper, we present a general and broadly applicable approach that enhances the performance of such generalist robot policies at deployment time by re-ranking their actions according to a value function learned via offline RL. This approach, which we call Value-Guided Policy Steering (V-GPS), is compatible with a wide range of different generalist policies, without needing to fine-tune or even access the weights of the policy. We show that the same value function can improve the performance of five different state-of-the-art policies with different architectures, even though they were trained on distinct datasets, attaining consistent performance improvement on multiple robotic platforms across a total of 12 tasks. Code and videos can be found at: https://nakamotoo.github.io/V-GPS", "keywords": "generalist policies;value functions;robot reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/ca91a83d299762a4c35f72768603b794f5a1888e.zip", "author": "Mitsuhiko Nakamoto;Oier Mees;Aviral Kumar;Sergey Levine", "authorids": "~Mitsuhiko_Nakamoto1;~Oier_Mees1;~Aviral_Kumar2;~Sergey_Levine1", "gender": ";M;M;M", "homepage": "https://nakamotoo.github.io/;https://www.oiermees.com/;https://aviralkumar2907.github.io/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";190/8659;202/7961;80/7594", "google_scholar": "wIDVzroAAAAJ;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;;8R35rCwAAAAJ", "orcid": ";;;", "linkedin": ";oier-mees-a3069488;;", "or_profile": "~Mitsuhiko_Nakamoto1;~Oier_Mees1;~Aviral_Kumar2;~Sergey_Levine1", "aff": "University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;Google DeepMind;Google", "aff_domain": "eecs.berkeley.edu;eecs.berkeley.edu;google.com;google.com", "position": "PhD student;Postdoc;Researcher;Research Scientist", "bibtex": "@inproceedings{\nnakamoto2024steering,\ntitle={Steering Your Generalists: Improving Robotic Foundation Models via Value Guidance},\nauthor={Mitsuhiko Nakamoto and Oier Mees and Aviral Kumar and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6FGlpzC9Po}\n}", "github": "", "project": "", "reviewers": "jKak;Gdoc;AHDo", "site": "https://openreview.net/forum?id=6FGlpzC9Po", "pdf_size": 0, "rating": "2;2;3", "confidence": "4;4;4", "rating_avg": 2.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6596130003395981161&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.berkeley.edu;https://deepmind.com", "aff_unique_abbr": "UC Berkeley;DeepMind", "aff_campus_unique_index": "0;0;2", "aff_campus_unique": "Berkeley;;Mountain View", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "6X3ybeVpDi", "title": "Online Transfer and Adaptation of Tactile Skill: A Teleoperation Framework", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper presents a teleoperation framework designed for online learning and adaptation of tactile skills, which provides an intuitive interface without need for physical access to execution 
robot. The proposed tele-teaching approach utilizes periodic Dynamical Movement Primitives (DMPs) and Recursive Least Squares (RLS) for generating tactile skills. An autonomy allocation strategy, guided by the learning confidence and operator intention, ensures a smooth transition from human demonstration to autonomous robot operation. Our experimental results with two 7-Degree-of-Freedom (DoF) Franka Panda robots demonstrate that the tele-teaching framework facilitates online motion and force learning and adaptation within a few iterations.", "keywords": "Learning from Demonstration;Online Adaptation;Tactile Skill;Teleoperation;Autonomy Allocation", "primary_area": "", "supplementary_material": "/attachment/66d790f4e06d2b6947747476b7858e57d91ed087.zip", "author": "Xiao Chen;Tianle Ni;K\u00fcbra Karacan;Hamid Sadeghian;Sami Haddadin", "authorids": "~Xiao_Chen14;~Tianle_Ni1;~K\u00fcbra_Karacan1;hamid.sadeghian@tum.de;~Sami_Haddadin1", "gender": "F;;F;;", "homepage": ";;https://www.ce.cit.tum.de/en/rsi/team/karacan-kuebra/;;", "dblp": ";;;;", "google_scholar": "-7pr7B4AAAAJ;K9tsrN4AAAAJ;https://scholar.google.de/citations?user=iU-nHF8AAAAJ;;", "orcid": ";;;;", "linkedin": ";;kubrakaracan/;;", "or_profile": "~Xiao_Chen14;~Tianle_Ni1;~K\u00fcbra_Karacan1;hamid.sadeghian@tum.de;~Sami_Haddadin1", "aff": "Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;Technische Universit\u00e4t M\u00fcnchen;;", "aff_domain": "tum.de;tum.de;tum.edu;;", "position": "PhD student;MS student;PhD student;;", "bibtex": "@inproceedings{\nchen2024online,\ntitle={Online Transfer and Adaptation of Tactile Skill: A Teleoperation Framework},\nauthor={Xiao Chen and Tianle Ni and K{\\\"u}bra Karacan and Hamid Sadeghian and Sami Haddadin},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6X3ybeVpDi}\n}", "github": "", "project": "", "reviewers": "32qn;encR;4fok", "site": "https://openreview.net/forum?id=6X3ybeVpDi", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16707063251433079208&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen", "aff_unique_dep": "", "aff_unique_url": "https://www.tum.de", "aff_unique_abbr": "TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "6oESa4g05O", "title": "Distribution Discrepancy and Feature Heterogeneity for Active 3D Object Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "LiDAR-based 3D object detection is a critical technology for the development of autonomous driving and robotics. However, the high cost of data annotation limits its advancement. We propose a novel and effective active learning (AL) method called Distribution Discrepancy and Feature Heterogeneity (DDFH), which simultaneously considers geometric features and model embeddings, assessing information from both the instance-level and frame-level perspectives. Distribution Discrepancy evaluates the difference and novelty of instances within the unlabeled and labeled distributions, enabling the model to learn efficiently with limited data.
Feature Heterogeneity ensures the heterogeneity of intra-frame instance features, maintaining feature diversity while avoiding redundant or similar instances, thus minimizing annotation costs. Finally, multiple indicators are efficiently aggregated using Quantile Transform, providing a unified measure of informativeness. Extensive experiments demonstrate that DDFH outperforms the current state-of-the-art (SOTA) methods on the KITTI and Waymo datasets, effectively reducing the bounding box annotation cost by 56.3% and showing robustness when working with both one-stage and two-stage models.", "keywords": "Active Learning;LiDAR 3D Object Detection;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/3826dcc6152f44835194290954c447d38e0deee0.zip", "author": "Huang-Yu Chen;Jia-Fong Yeh;Jiawei;Pin-Hsuan Peng;Winston H. Hsu", "authorids": "~Huang-Yu_Chen1;~Jia-Fong_Yeh1;~Jiawei1;~Pin-Hsuan_Peng1;~Winston_H._Hsu2", "gender": "M;;M;F;M", "homepage": ";https://www.cmlab.csie.ntu.edu.tw/~jiafongyeh/;https://jwliao1209.github.io/;;https://winstonhsu.info/", "dblp": ";198/7831;;;16/5668.html", "google_scholar": ";kS-oZ20AAAAJ;https://scholar.google.com.tw/citations?user=Lbr3sNMAAAAJ;https://scholar.google.com.tw/citations?hl=zh-TW;https://scholar.google.com.tw/citations?user=NOvDH3QAAAAJ", "orcid": ";;;0000-0003-4182-7015;0000-0002-3330-0638", "linkedin": "huang-yu-chen-02b4101b5/;;jwliao1209/;pin-hsuan/;", "or_profile": "~Huang-Yu_Chen1;~Jia-Fong_Yeh1;~Jiawei1;~Pin-Hsuan_Peng1;~Winston_Hsu1", "aff": "National Taiwan University;Sony Group Corporation;National Taiwan University;National Taiwan University;National Taiwan University", "aff_domain": "ntu.edu.tw;sony.com;ntu.edu.tw;ntu.edu.tw;ntu.edu.tw", "position": "MS student;Intern;PhD student;MS student;Professor", "bibtex": "@inproceedings{\nchen2024distribution,\ntitle={Distribution Discrepancy and Feature Heterogeneity for Active 3D Object Detection},\nauthor={Huang-Yu Chen and Jia-Fong Yeh and Jiawei and Pin-Hsuan Peng and Winston H. Hsu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=6oESa4g05O}\n}", "github": "https://github.com/Coolshanlan/DDFH-active-3Ddet", "project": "", "reviewers": "Ybjb;x4Ww;jH4f", "site": "https://openreview.net/forum?id=6oESa4g05O", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Z_N4CagdXswJ:scholar.google.com/&scioq=Distribution+Discrepancy+and+Feature+Heterogeneity+for+Active+3D+Object+Detection&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "National Taiwan University;Sony Group Corporation", "aff_unique_dep": ";", "aff_unique_url": "https://www.ntu.edu.tw;https://www.sony.com", "aff_unique_abbr": "NTU;Sony", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "China;Japan" }, { "id": "7E3JAys1xO", "title": "D$^3$RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Depth sensing is an important problem for 3D vision-based robotics. Yet, a real-world active stereo or ToF depth camera often produces noisy and incomplete depth which bottlenecks robot performances. 
In this work, we propose D3RoMa, a learning-based depth estimation framework on stereo image pairs that predicts clean and accurate depth in diverse indoor scenes, even in the most challenging scenarios with translucent or specular surfaces where classical depth sensing completely fails. Key to our method is that we unify depth estimation and restoration into an image-to-image translation problem by predicting the disparity map with a denoising diffusion probabilistic model. At inference time, we further incorporated a left-right consistency constraint as classifier guidance to the diffusion process. Our framework combines recently advanced learning-based approaches and geometric constraints from traditional stereo vision. For model training, we create a large scene-level synthetic dataset with diverse transparent and specular objects to compensate for existing tabletop datasets. The trained model can be directly applied to real-world in-the-wild scenes and achieve state-of-the-art performance in multiple public depth estimation benchmarks. Further experiments in both simulated and real environments show that accurate depth prediction significantly improves robotic manipulation in various scenarios.", "keywords": "Depth Estimation;Diffusion Model;Stereo Vision", "primary_area": "", "supplementary_material": "/attachment/123ec883a3bd1018e1ee7efc7d7e55a34bbfd280.zip", "author": "Songlin Wei;Haoran Geng;Jiayi Chen;Congyue Deng;Cui Wenbo;Chengyang Zhao;Xiaomeng Fang;Leonidas Guibas;He Wang", "authorids": "~Songlin_Wei1;~Haoran_Geng1;~Jiayi_Chen5;~Congyue_Deng1;~Cui_Wenbo1;~Chengyang_Zhao1;~Xiaomeng_Fang1;~Leonidas_Guibas1;~He_Wang5", "gender": "M;M;F;M;M;;M;M;M", "homepage": "http://wei.songl.in;https://github.com/geng-haoran/haorangeng;https://cs.stanford.edu/~congyue/;;https://chengyzhao.github.io;;http://geometry.stanford.edu/;https://hughw19.github.io;https://github.com/JYChen18", "dblp": ";295/7112;267/5521;;237/5995;;g/LeonidasJGuibas;01/6368-10;42/1159-3", "google_scholar": "jmtAxTgAAAAJ;Inr-6rEAAAAJ;XJZ8UBcAAAAJ;https://scholar.google.com.hk/citations?user=IHB_8okAAAAJ;XIFrv2cAAAAJ;;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ;roCAWkoAAAAJ;", "orcid": "0000-0002-1487-1494;;;;;;;;", "linkedin": ";haoran-geng-422778238/;;;;fangxiaomeng;;;", "or_profile": "~Songlin_Wei1;~Haoran_Geng1;~Congyue_Deng1;~Cui_Wenbo1;~Chengyang_Zhao1;~Xiaomeng_Fang1;~Leonidas_Guibas1;~He_Wang5;~jiayi_chen3", "aff": "Peking University;Peking University;Stanford University;University of Chinese Academy of Sciences;Peking University;;Stanford University;Peking University;Peking University", "aff_domain": "stu.pku.edu.cn;pku.edu.cn;stanford.edu;ucas.edu;pku.edu.cn;;stanford.edu;pku.edu.cn;pku.edu.cn", "position": "PhD student;Undergrad student;PhD student;PhD student;Undergrad student;;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nwei2024droma,\ntitle={D\\${\\textasciicircum}3\\$RoMa: Disparity Diffusion-based Depth Sensing for Material-Agnostic Robotic Manipulation},\nauthor={Songlin Wei and Haoran Geng and Jiayi Chen and Congyue Deng and Cui Wenbo and Chengyang Zhao and Xiaomeng Fang and Leonidas Guibas and He Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7E3JAys1xO}\n}", "github": "", "project": "", "reviewers": "Jtaw;jawu;SQrs", "site": "https://openreview.net/forum?id=7E3JAys1xO", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, 
"replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1105521645784772222&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0;1;0;0", "aff_unique_norm": "Peking University;Stanford University;University of Chinese Academy of Sciences", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;http://www.ucas.ac.cn", "aff_unique_abbr": "Peking U;Stanford;UCAS", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;1;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "7c5rAY8oU3", "title": "Automated Creation of Digital Cousins for Robust Policy Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Training robot policies in the real world can be unsafe, costly, and difficult to scale. Simulation serves as an inexpensive and potentially limitless source of training data, but suffers from the semantics and physics disparity between simulated and real-world environments. These discrepancies can be minimized by training in *digital twins*, which serve as virtual replicas of a real scene but are expensive to generate and cannot produce cross-domain generalization. To address these limitations, we propose the concept of ***digital cousins***, a virtual asset or scene that, unlike a *digital twin*, does not explicitly model a real-world counterpart but still exhibits similar geometric and semantic affordances. As a result, *digital cousins* simultaneously reduce the cost of generating an analogous virtual environment while also facilitating better robustness during sim-to-real domain transfer by providing a distribution of similar training scenes. Leveraging digital cousins, we introduce a novel method for their automated creation, and propose a fully automated real-to-sim-to-real pipeline for generating fully interactive scenes and training robot policies that can be deployed zero-shot in the original scene. We find that digital cousin scenes that preserve geometric and semantic affordances can be produced automatically, and can be used to train policies that outperform policies trained on digital twins, achieving 90\\% vs. 25\\% success rates under zero-shot sim-to-real transfer. 
Additional details are available at https://digital-cousins.github.io/.", "keywords": "Real-to-Sim; Digital Twin; Sim-to-Real Transfer", "primary_area": "", "supplementary_material": "/attachment/14f10d5e363aa736b73dd19e9c6dc1ce568bb8a4.zip", "author": "Tianyuan Dai;Josiah Wong;Yunfan Jiang;Chen Wang;Cem Gokmen;Ruohan Zhang;Jiajun Wu;Li Fei-Fei", "authorids": "tydai@stanford.edu;~Josiah_Wong1;~Yunfan_Jiang1;~Chen_Wang16;~Cem_Gokmen1;~Ruohan_Zhang1;~Jiajun_Wu1;~Li_Fei-Fei1", "gender": ";M;M;M;M;M;M;F", "homepage": ";https://www.jdw.ong;https://yunfanj.com/;http://www.chenwangjeremy.net/;https://www.cemgokmen.com;https://ai.stanford.edu/~zharu/;https://jiajunwu.com;https://profiles.stanford.edu/fei-fei-li", "dblp": ";178/8895;311/5581-1;;220/3187;;117/4768;79/2528", "google_scholar": ";Y0a0n5wAAAAJ;https://scholar.google.com/citations?hl=en;lStkAzsAAAAJ;wCiI8oUAAAAJ;-bqvNWoAAAAJ;2efgcS0AAAAJ;rDfyQnIAAAAJ", "orcid": ";;;;0000-0001-9446-6052;;0000-0002-4176-343X;", "linkedin": ";josiahw/;;;cgokmen/;;jiajunwu/;fei-fei-li-4541247/", "or_profile": "tydai@stanford.edu;~Josiah_Wong1;~Yunfan_Jiang1;~Chen_Wang16;~Cem_Gokmen1;~Ruohan_Zhang1;~Jiajun_Wu1;~Li_Fei-Fei1", "aff": ";Stanford University;Stanford University;Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": ";stanford.edu;cs.stanford.edu;cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": ";MS student;PhD student;PhD student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\ndai2024automated,\ntitle={Automated Creation of Digital Cousins for Robust Policy Learning},\nauthor={Tianyuan Dai and Josiah Wong and Yunfan Jiang and Chen Wang and Cem Gokmen and Ruohan Zhang and Jiajun Wu and Li Fei-Fei},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7c5rAY8oU3}\n}", "github": "https://github.com/cremebrule/digital-cousins", "project": "", "reviewers": "hB4w;wrjg;oRdT", "site": "https://openreview.net/forum?id=7c5rAY8oU3", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5921923284014211356&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7ddT4eklmQ", "title": "ACE: A Cross-platform and visual-Exoskeletons System for Low-Cost Dexterous Teleoperation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Bimanual robotic manipulation with dexterous hands has a large potential workability and a wide workspace as it follows the most natural human workflow.\nLearning from human demonstrations has proven highly effective for learning a dexterous manipulation policy. To collect such data, teleoperation serves as a straightforward and efficient way to do so.\nHowever, a cost-effective and easy-to-use teleoperation system is lacking for anthropomorphic robot hands.\nTo fill the deficiency, we developed ACE, a cross-platform visual-exoskeleton system for low-cost dexterous teleoperation. 
\nOur system employs a hand-facing camera to capture 3D hand poses and an exoskeleton mounted on a base that can be easily carried on users' backs. ACE captures both the hand root end-effector and hand pose in real-time and enables cross-platform operations. \nWe evaluate the key system parameters compared with previous teleoperation systems and show clear advantages of ACE.\nWe then showcase the desktop and mobile versions of our system on six different robot platforms (including humanoid-hands, arm-hands, arm-gripper, and quadruped-gripper systems), and demonstrate the effectiveness of learning three difficult real-world tasks through the collected demonstrations on two of them.", "keywords": "Teleoperation System; Hardware; Imitation Learning; Robot Learning; Exoskeletons", "primary_area": "", "supplementary_material": "/attachment/f8a80d05a3e6c9b401f3dd54691beb2a8764fb42.zip", "author": "Shiqi Yang;Minghuan Liu;Yuzhe Qin;Runyu Ding;Jialong Li;Xuxin Cheng;Ruihan Yang;Sha Yi;Xiaolong Wang", "authorids": "~Shiqi_Yang2;~Minghuan_Liu1;~Yuzhe_Qin1;~Runyu_Ding1;~Jialong_Li3;~Xuxin_Cheng2;~Ruihan_Yang2;~Sha_Yi1;~Xiaolong_Wang3", "gender": "M;M;M;F;M;M;M;F;M", "homepage": "https://aaronyang1223.github.io/;http://minghuanliu.com;https://yzqin.github.io/;https://dingry.github.io/;https://rexskywalkerlee.github.io/;https://chengxuxin.github.io;http://rchalyang.github.io/;https://yswhynot.github.io;https://xiaolonw.github.io/", "dblp": ";249/7554;241/9337;289/1652;;;;;91/952-4", "google_scholar": "OQQzJb4AAAAJ;;3KF3AIMAAAAJ;https://scholar.google.com.hk/citations?view_op=list_works;;Z8vhOxYAAAAJ;b-o1o7cAAAAJ;;Y8O9N_0AAAAJ", "orcid": "0009-0009-8529-4522;;0000-0002-9321-9305;;;;;;", "linkedin": ";;;;jialong-li-737a561a8/;;;;", "or_profile": "~Shiqi_Yang2;~Minghuan_Liu1;~Yuzhe_Qin1;~Runyu_Ding1;~Jialong_Li3;~Xuxin_Cheng2;~Ruihan_Yang2;~Sha_Yi1;~Xiaolong_Wang3", "aff": "University of California, San Diego;Shanghai Jiaotong University;University of California, San Diego;Electrical and Electronic Engineering, University of Hong Kong;University of California, San Diego;University of California, San Diego;University of California, San Diego;Carnegie Mellon University;University of California, San Diego", "aff_domain": "ucsd.edu;sjtu.edu.cn;ucsd.edu;eee.hku.hk;ucsd.edu;ucsd.edu;ucsd.edu;cmu.edu;ucsd.edu", "position": "MS student;PhD student;PhD student;PhD student;MS student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyang2024ace,\ntitle={{ACE}: A Cross-platform and visual-Exoskeletons System for Low-Cost Dexterous Teleoperation},\nauthor={Shiqi Yang and Minghuan Liu and Yuzhe Qin and Runyu Ding and Jialong Li and Xuxin Cheng and Ruihan Yang and Sha Yi and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7ddT4eklmQ}\n}", "github": "https://github.com/ACETeleop/ACETeleop", "project": "", "reviewers": "14Vt;yoG4;BGSU", "site": "https://openreview.net/forum?id=7ddT4eklmQ", "pdf_size": 0, "rating": "1;2;3", "confidence": "4;3;4", "rating_avg": 2.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 37, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6038255640108992027&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;0;0;0;3;0", "aff_unique_norm": "University of California, San Diego;Shanghai Jiao Tong University;University of Hong Kong;Carnegie Mellon University", 
"aff_unique_dep": ";;Electrical and Electronic Engineering;", "aff_unique_url": "https://www.ucsd.edu;https://www.sjtu.edu.cn;https://www.hku.hk;https://www.cmu.edu", "aff_unique_abbr": "UCSD;SJTU;HKU;CMU", "aff_campus_unique_index": "0;0;2;0;0;0;0", "aff_campus_unique": "San Diego;;Hong Kong SAR", "aff_country_unique_index": "0;1;0;1;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "7vzDBvviRO", "title": "UBSoft: A Simulation Platform for Robotic Skill Learning in Unbounded Soft Environments", "track": "main", "status": "Poster", "tldr": "", "abstract": "It is desired to equip robots with the capability of interacting with various soft materials as they are ubiquitous in the real world. While physics simulations are one of the predominant methods for data collection and robot training, simulating soft materials presents considerable challenges. Specifically, it is significantly more costly than simulating rigid objects in terms of simulation speed and storage requirements. These limitations typically restrict the scope of studies on soft materials to small and bounded areas, thereby hindering the learning of skills in broader spaces. To address this issue, we introduce UBSoft, a new simulation platform designed to support unbounded soft environments for robot skill acquisition. Our platform utilizes spatially adaptive resolution scales, where simulation resolution dynamically adjusts based on proximity to active robotic agents. Our framework markedly reduces the demand for extensive storage space and computation costs required for large-scale scenarios involving soft materials. We also establish a set of benchmark tasks in our platform, including both locomotion and manipulation tasks, and conduct experiments to evaluate the efficacy of various reinforcement learning algorithms and trajectory optimization techniques, both gradient-based and sampling-based. Preliminary results indicate that sampling-based trajectory optimization generally achieves better results for obtaining one trajectory to solve the task. Additionally, we conduct experiments in real-world environments to demonstrate that advancements made in our UBSoft simulator could translate to improved robot interactions with large-scale soft material. 
More videos can be found at https://ubsoft24.github.io.", "keywords": "Soft-Body Manipulation;Locomotion;Physics Simulation", "primary_area": "", "supplementary_material": "/attachment/a47e08ede1fb53864532e5b3e10f0b23b4cf4ce2.zip", "author": "Chunru Lin;Jugang Fan;Yian Wang;Zeyuan Yang;Zhehuan Chen;Lixing Fang;Tsun-Hsuan Wang;Zhou Xian;Chuang Gan", "authorids": "~Chunru_Lin1;~Jugang_Fan1;~Yian_Wang1;~Zeyuan_Yang3;~Zhehuan_Chen1;~Lixing_Fang1;~Tsun-Hsuan_Wang2;~Zhou_Xian1;~Chuang_Gan1", "gender": "F;;M;M;M;M;M;M;M", "homepage": "https://xhrlyb.github.io;https://github.com/felixfjg;https://miicheyang.github.io/;https://www.cnblogs.com/ACMLCZH;https://owenowl.github.io/;https://zswang666.github.io/;;http://people.csail.mit.edu/ganchuang/;http://wangyian-me.github.io/", "dblp": "324/5212;;260/6331-2.html;;;217/1809.html;258/5020;139/6993;71/10046", "google_scholar": "PTYVWdIAAAAJ;;k_qpTh4AAAAJ;LvNUzlEAAAAJ;;xE3WSuYAAAAJ;;PTeSCbIAAAAJ;dUf3wx4AAAAJ", "orcid": ";;;;;;;;", "linkedin": ";;;;;;;;", "or_profile": "~Chunru_Lin1;~Jugang_Fan1;~Zeyuan_Yang3;~Zhehuan_Chen1;~Lixing_Fang1;~Tsun-Hsuan_Wang2;~Zhou_Xian1;~Chuang_Gan1;~\u9038\u5b89_\u738b1", "aff": "University of Massachusetts at Amherst;South China University of Technology;, Tsinghua University;University of Massachusetts at Amherst;Tsinghua University;Liquid AI;Carnegie Mellon University;University of Massachusetts at Amherst;NVIDIA", "aff_domain": "umass.edu;scut.edu.cn;cs.tsinghua.edu.cn;umass.edu;tsinghua.edu.cn;liquid.ai;cmu.edu;umass.edu;nvidia.com", "position": "PhD student;MS student;MS student;MS student;Undergrad student;Researcher;PhD student;Assistant Professor;Intern", "bibtex": "@inproceedings{\nlin2024ubsoft,\ntitle={{UBS}oft: A Simulation Platform for Robotic Skill Learning in Unbounded Soft Environments},\nauthor={Chunru Lin and Jugang Fan and Yian Wang and Zeyuan Yang and Zhehuan Chen and Lixing Fang and Tsun-Hsuan Wang and Zhou Xian and Chuang Gan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7vzDBvviRO}\n}", "github": "", "project": "", "reviewers": "2B5z;P9RJ;gbiB", "site": "https://openreview.net/forum?id=7vzDBvviRO", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;3", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5041910731503732799&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;2;3;4;0;5", "aff_unique_norm": "University of Massachusetts Amherst;South China University of Technology;Tsinghua University;Liquid AI;Carnegie Mellon University;NVIDIA", "aff_unique_dep": ";;;;;NVIDIA Corporation", "aff_unique_url": "https://www.umass.edu;https://www.scut.edu.cn;https://www.tsinghua.edu.cn;;https://www.cmu.edu;https://www.nvidia.com", "aff_unique_abbr": "UMass Amherst;SCUT;THU;;CMU;NVIDIA", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Amherst;", "aff_country_unique_index": "0;1;1;0;1;2;0;0;0", "aff_country_unique": "United States;China;Unknown" }, { "id": "7wMlwhCvjS", "title": "GenDP: 3D Semantic Fields for Category-Level Generalizable Diffusion Policy", "track": "main", "status": "Poster", "tldr": "", "abstract": "Diffusion-based policies have shown remarkable capability in executing complex robotic manipulation tasks but lack explicit characterization of geometry and semantics, which often limits their ability to generalize to unseen objects and layouts. 
To enhance the generalization capabilities of Diffusion Policy, we introduce a novel framework that incorporates explicit spatial and semantic information via 3D semantic fields. We generate 3D descriptor fields from multi-view RGBD observations with large foundational vision models, then compare these descriptor fields against reference descriptors to obtain semantic fields. The proposed method explicitly considers geometry and semantics, enabling strong generalization capabilities in tasks requiring category-level generalization, resolving geometric ambiguities, and attention to subtle geometric details. We evaluate our method across eight tasks involving articulated objects and instances with varying shapes and textures from multiple object categories. Our method demonstrates its effectiveness by increasing Diffusion Policy's average success rate on \\textit{unseen} instances from 20\\% to 93\\%. Additionally, we provide a detailed analysis and visualization to interpret the sources of performance gain and explain how our method can generalize to novel instances. Project page: https://robopil.github.io/GenDP/", "keywords": "Semantic Fields;Category-Level Generalization;Imitation Learning;Diffusion Models", "primary_area": "", "supplementary_material": "/attachment/ee856a511a4fe4b2f0a9378a2d83b3d6a21c9865.zip", "author": "Yixuan Wang;Guang Yin;Binghao Huang;Tarik Kelestemur;Jiuguang Wang;Yunzhu Li", "authorids": "~Yixuan_Wang2;~Guang_Yin1;~Binghao_Huang1;~Tarik_Kelestemur1;~Jiuguang_Wang1;~Yunzhu_Li1", "gender": "M;M;;;;M", "homepage": "https://wangyixuan12.github.io/;https://github.com/AlchemicRonin;https://binghao-huang.github.io/;https://kelestemur.com/;;https://yunzhuli.github.io/", "dblp": "44/4317-3;;;;;182/1831", "google_scholar": "https://scholar.google.com/citations?hl=en;;nqoOetAAAAAJ;;;WlA92lcAAAAJ", "orcid": "0009-0006-6641-4718;;;;;", "linkedin": "yixuan-wang-54298115a;;;;;", "or_profile": "~Yixuan_Wang2;~Guang_Yin1;~Binghao_Huang1;~Tarik_Kelestemur1;~Jiuguang_Wang1;~Yunzhu_Li1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign;Boston Dynamics AI Institute;;University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;uiuc.edu;illinois.edu;theaiinstitute.com;;illinois.edu", "position": "MS student;MS student;PhD student;Researcher;;Assistant Professor", "bibtex": "@inproceedings{\nwang2024gendp,\ntitle={Gen{DP}: 3D Semantic Fields for Category-Level Generalizable Diffusion Policy},\nauthor={Yixuan Wang and Guang Yin and Binghao Huang and Tarik Kelestemur and Jiuguang Wang and Yunzhu Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7wMlwhCvjS}\n}", "github": "https://github.com/WangYixuan12/gendp", "project": "", "reviewers": "WbYD;ruvW;m8cy", "site": "https://openreview.net/forum?id=7wMlwhCvjS", "pdf_size": 0, "rating": "1;3;3", "confidence": "4;5;4", "rating_avg": 2.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15960661270735128186&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Boston Dynamics AI Institute", "aff_unique_dep": ";AI Institute", "aff_unique_url": "https://illinois.edu;https://www.bostondynamics.com/", "aff_unique_abbr": "UIUC;BD AI", 
"aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "7yMZAUkXa4", "title": "MimicTouch: Leveraging Multi-modal Human Tactile Demonstrations for Contact-rich Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Tactile sensing is critical to fine-grained, contact-rich manipulation tasks, such as insertion and assembly. Prior research has shown the possibility of learning tactile-guided policy from teleoperated demonstration data. However, to provide the demonstration, human users often rely on visual feedback to control the robot. This creates a gap between the sensing modality used for controlling the robot (visual) and the modality of interest (tactile). To bridge this gap, we introduce \"MimicTouch'', a novel framework for learning policies directly from demonstrations provided by human users with their hands. The key innovations are i) a human tactile data collection system which collects multi-modal tactile dataset for learning human's tactile-guided control strategy, ii) an imitation learning-based framework for learning human's tactile-guided control strategy through such data, and iii) an online residual RL framework to bridge the embodiment gap between the human hand and the robot gripper. Through comprehensive experiments, we highlight the efficacy of utilizing human's tactile-guided control strategy to resolve contact-rich manipulation tasks. The project website is at https://sites.google.com/view/MimicTouch.", "keywords": "Tactile Sensing;Learning from Human;Data Collection;Imitation Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/d8f5c3f27d856704eb4f2529bd7b8bc6ebec401a.zip", "author": "Kelin Yu;Yunhai Han;Qixian Wang;Vaibhav Saxena;Danfei Xu;Ye Zhao", "authorids": "~Kelin_Yu1;~Yunhai_Han1;qxwang_m@zju.edu.cn;~Vaibhav_Saxena1;~Danfei_Xu1;~Ye_Zhao2", "gender": "M;M;;M;M;M", "homepage": "https://colinyu1.github.io/ColinYu1.github.io/;https://y8han.github.io/;;https://sites.google.com/view/vaibhavsaxena;https://cs.stanford.edu/~danfei/;http://lab-idar.gatech.edu/", "dblp": ";276/6126;;90/5273;135/8443;", "google_scholar": "zVdZJRwAAAAJ;lsN3nY8AAAAJ;;J9xMyxMAAAAJ;J5D4kcoAAAAJ;", "orcid": ";;;;;", "linkedin": "kelin-yu-3655a91b5/;;;vaibhavsaxena11/;;", "or_profile": "~Kelin_Yu1;~Yunhai_Han1;qxwang_m@zju.edu.cn;~Vaibhav_Saxena1;~Danfei_Xu1;~Ye_Zhao2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;;Georgia Institute of Technology;NVIDIA;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;;gatech.edu;nvidia.com;gatech.edu", "position": "MS student;PhD student;;PhD student;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nyu2024mimictouch,\ntitle={MimicTouch: Leveraging Multi-modal Human Tactile Demonstrations for Contact-rich Manipulation},\nauthor={Kelin Yu and Yunhai Han and Qixian Wang and Vaibhav Saxena and Danfei Xu and Ye Zhao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=7yMZAUkXa4}\n}", "github": "", "project": "", "reviewers": "VHmX;4Bsg;W9Dy", "site": "https://openreview.net/forum?id=7yMZAUkXa4", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=2218640223214388567&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "82bpTugrMt", "title": "Monocular Event-Based Vision for Obstacle Avoidance with a Quadrotor", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present the first static-obstacle avoidance method for quadrotors using just an onboard, monocular event camera. Quadrotors are capable of fast and agile flight in cluttered environments when piloted manually, but vision-based autonomous flight in unknown environments is difficult in part due to the sensor limitations of traditional onboard cameras. Event cameras, however, promise nearly zero motion blur and high dynamic range, but produce a very large volume of events under significant ego-motion and further lack a continuous-time sensor model in simulation, making direct sim-to-real transfer not possible. By leveraging depth prediction as a pretext task in our learning framework, we can pre-train a reactive obstacle avoidance events-to-control policy with approximated, simulated events and then fine-tune the perception component with limited events-and-depth real-world data to achieve obstacle avoidance in indoor and outdoor settings. We demonstrate this across two quadrotor-event camera platforms in multiple settings and find, contrary to traditional vision-based works, that low speeds (1m/s) make the task harder and more prone to collisions, while high speeds (5m/s) result in better event-based depth estimation and avoidance. 
We also find that success rates in outdoor scenes can be significantly higher than in certain indoor scenes.", "keywords": "event-based vision;learning for control;simulation-to-real transfer;aerial robotics", "primary_area": "", "supplementary_material": "/attachment/ace21a8a0881fe48df43cf90e3a69cb67d99abd9.zip", "author": "Anish Bhattacharya;Marco Cannici;Nishanth Rao;Yuezhan Tao;Vijay Kumar;Nikolai Matni;Davide Scaramuzza", "authorids": "~Anish_Bhattacharya1;~Marco_Cannici1;~Nishanth_Rao1;~Yuezhan_Tao1;~Vijay_Kumar2;~Nikolai_Matni1;~Davide_Scaramuzza1", "gender": "M;;;;;M;", "homepage": "https://www.anishbhattacharya.com;;;;http://kumarrobotics.org;https://nikolaimatni.github.io;", "dblp": ";220/3899;;;;52/8135;", "google_scholar": "L3XqyLIAAAAJ;https://scholar.google.it/citations?user=Xd9geyMAAAAJ;;7IwA14gAAAAJ;FUOEBDUAAAAJ;ZDPCh_EAAAAJ;", "orcid": "0000-0002-5961-5486;;;;;;", "linkedin": "anish-bhattacharya/;;;;;;", "or_profile": "~Anish_Bhattacharya1;~Marco_Cannici1;~Nishanth_Rao1;~Yuezhan_Tao1;~Vijay_Kumar2;~Nikolai_Matni1;~Davide_Scaramuzza1", "aff": "University of Pennsylvania;University of Zurich;;University of Pennsylvania;University of Pennsylvania;School of Engineering and Applied Science, University of Pennsylvania;", "aff_domain": "upenn.edu;ifi.uzh.ch;;seas.upenn.edu;upenn.edu;seas.upenn.edu;", "position": "PhD student;Postdoc;;PhD student;Full Professor;Assistant Professor;", "bibtex": "@inproceedings{\nbhattacharya2024monocular,\ntitle={Monocular Event-Based Vision for Obstacle Avoidance with a Quadrotor},\nauthor={Anish Bhattacharya and Marco Cannici and Nishanth Rao and Yuezhan Tao and Vijay Kumar and Nikolai Matni and Davide Scaramuzza},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=82bpTugrMt}\n}", "github": "", "project": "", "reviewers": "tqUw;gGzy;n5tQ", "site": "https://openreview.net/forum?id=82bpTugrMt", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2681647363588669201&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "University of Pennsylvania;University of Zurich", "aff_unique_dep": ";", "aff_unique_url": "https://www.upenn.edu;https://www.unizh.ch", "aff_unique_abbr": "UPenn;UZH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;0;0", "aff_country_unique": "United States;Switzerland" }, { "id": "8Ar8b00GJC", "title": "Autonomous Improvement of Instruction Following Skills via Foundation Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Intelligent robots capable of improving from autonomously collected experience have the potential to transform robot learning: instead of collecting costly teleoperated demonstration data, large-scale deployment of fleets of robots can quickly collect larger quantities of autonomous data useful for training better robot policies. However, autonomous improvement requires solving two key problems: (i) fully automating a scalable data collection procedure that can collect diverse and semantically meaningful robot data and (ii) learning from non-optimal, autonomous data with no human annotations. 
To this end, we propose a novel approach that addresses these challenges, allowing instruction following policies to improve from autonomously collected data without human supervision. Our framework leverages vision-language models to collect and evaluate semantically meaningful experiences in new environments, and then utilizes a decomposition of instruction following tasks into (semantic) language-conditioned image generation and (non-semantic) goal reaching, which makes it significantly more practical to improve from this autonomously collected data without any human annotations. We carry out extensive experiments in the real world to demonstrate the effectiveness of our approach, and find that in a suite of unseen environments, the robot policy can be improved significantly with autonomously collected data. We open-source the code for our semantic autonomous improvement pipeline, as well as our autonomous dataset of 25K trajectories collected across five tabletop environments: https://soar-autonomous-improvement.github.io", "keywords": "Autonomous Improvement;Instruction Following Skills;Scaled Data Collection", "primary_area": "", "supplementary_material": "/attachment/12b42a82f850f6c9043c871aa437bf01b347cfc6.zip", "author": "Zhiyuan Zhou;Pranav Atreya;Abraham Lee;Homer Rich Walke;Oier Mees;Sergey Levine", "authorids": "~Zhiyuan_Zhou2;~Pranav_Atreya1;~Abraham_Lee2;~Homer_Rich_Walke1;~Oier_Mees1;~Sergey_Levine1", "gender": "M;;M;M;M;M", "homepage": "https://zhouzypaul.github.io;https://pranavatreya.github.io;;https://homerwalke.com;https://www.oiermees.com/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";317/4655;;279/6795;190/8659;80/7594", "google_scholar": "unQVOJkAAAAJ;bQowYEYAAAAJ;;ZWH5jCwAAAAJ;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;8R35rCwAAAAJ", "orcid": ";;;;;", "linkedin": "zhiyuan-paul-zhou/;pranav-d-atreya;abraham-lee-4a0497242/;;oier-mees-a3069488;", "or_profile": "~Zhiyuan_Zhou2;~Pranav_Atreya1;~Abraham_Lee2;~Homer_Rich_Walke1;~Oier_Mees1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;eecs.berkeley.edu;google.com", "position": "PhD student;PhD student;Undergrad student;PhD student;Postdoc;Research Scientist", "bibtex": "@inproceedings{\nzhou2024autonomous,\ntitle={Autonomous Improvement of Instruction Following Skills via Foundation Models},\nauthor={Zhiyuan Zhou and Pranav Atreya and Abraham Lee and Homer Rich Walke and Oier Mees and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8Ar8b00GJC}\n}", "github": "https://github.com/rail-berkeley/soar", "project": "", "reviewers": "vqQn;zNvs;rBFY", "site": "https://openreview.net/forum?id=8Ar8b00GJC", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14137211475790478024&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", 
"aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8JLmTZsxGh", "title": "Learning Performance-oriented Control Barrier Functions Under Complex Safety Constraints and Limited Actuation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Control Barrier Functions (CBFs) offer an elegant framework for constraining nonlinear control system dynamics to an invariant subset of a pre-specified safe set. However, finding a CBF that simultaneously promotes performance by maximizing the resulting control invariant set while accommodating complex safety constraints, especially in high relative degree systems with actuation constraints, remains a significant challenge. In this work, we propose a novel self-supervised learning framework that holistically addresses these hurdles. Given a Boolean composition of multiple state constraints defining the safe set, our approach begins by constructing a smooth function whose zero superlevel set provides an inner approximation of the safe set. This function is then used with a smooth neural network to parameterize the CBF candidate. Finally, we design a physics-informed training loss function based on a Hamilton-Jacobi Partial Differential Equation (PDE) to train the PINN-CBF and enlarge the volume of the induced control invariant set. We demonstrate the effectiveness of our approach on a 2D double integrator (DI) system and a 7D fixed-wing aircraft system (F16).", "keywords": "Control Barrier Functions;Safety;Hamilton-Jacobi Partial Differential Equation", "primary_area": "", "supplementary_material": "/attachment/28b1bd8ea7a8ac0059a60620330c9a154b318569.zip", "author": "Lakshmideepakreddy Manda;Shaoru Chen;Mahyar Fazlyab", "authorids": "~Lakshmideepakreddy_Manda2;~Shaoru_Chen1;~Mahyar_Fazlyab1", "gender": "M;M;M", "homepage": ";https://www.shaoru.site/;https://www.ece.jhu.edu/mahyarfazlyab/", "dblp": ";254/9531;147/4846", "google_scholar": ";PUIfJYcAAAAJ;Y3bmjJwAAAAJ", "orcid": ";;", "linkedin": "deepak-manda-429846274;;", "or_profile": "~Lakshmideepakreddy_Manda2;~Shaoru_Chen1;~Mahyar_Fazlyab1", "aff": "Whiting School of Engineering;Microsoft Research;Johns Hopkins University", "aff_domain": "engineering.jhu.edu;microsoft.com;jhu.edu", "position": "PhD student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nmanda2024learning,\ntitle={Learning Performance-oriented Control Barrier Functions Under Complex Safety Constraints and Limited Actuation},\nauthor={Lakshmideepakreddy Manda and Shaoru Chen and Mahyar Fazlyab},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8JLmTZsxGh}\n}", "github": "https://github.com/o4lc/PINN-CBF", "project": "", "reviewers": "EPGh;6drv;LoGn", "site": "https://openreview.net/forum?id=8JLmTZsxGh", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15055293972985156510&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "Johns Hopkins University;Microsoft", "aff_unique_dep": "School of Engineering;Microsoft Research", "aff_unique_url": "https://engineering.jhu.edu;https://www.microsoft.com/en-us/research", "aff_unique_abbr": "JHU 
Engineering;MSR", "aff_campus_unique_index": "0", "aff_campus_unique": "Baltimore;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "8LPXeGhhbH", "title": "RAM: Retrieval-Based Affordance Transfer for Generalizable Zero-Shot Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "This work proposes a retrieve-and-transfer framework for zero-shot robotic manipulation, dubbed RAM, featuring generalizability across various objects, environments, and embodiments. Unlike existing approaches that learn manipulation from expensive in-domain demonstrations, RAM capitalizes on a retrieval-based affordance transfer paradigm to acquire versatile manipulation capabilities from abundant out-of-domain data. RAM first extracts unified affordance at scale from diverse sources of demonstrations including robotic data, human-object interaction (HOI) data, and custom data to construct a comprehensive affordance memory. Then given a language instruction, RAM hierarchically retrieves the most similar demonstration from the affordance memory and transfers such out-of-domain 2D affordance to in-domain 3D actionable affordance in a zero-shot and embodiment-agnostic manner. Extensive simulation and real-world evaluations demonstrate that our RAM consistently outperforms existing works in diverse daily tasks. Additionally, RAM shows significant potential for downstream applications such as automatic and efficient data collection, one-shot visual imitation, and LLM/VLM-integrated long-horizon manipulation.", "keywords": "Hierarchical Retrieval;Affordance Transfer;Zero-Shot Robotic Manipulation;Visual Foundation Models", "primary_area": "", "supplementary_material": "/attachment/54f23d46e3df993eca81e658a8b391b425dff4fb.zip", "author": "Yuxuan Kuang;Junjie Ye;Haoran Geng;Jiageng Mao;Congyue Deng;Leonidas Guibas;He Wang;Yue Wang", "authorids": "~Yuxuan_Kuang1;~Junjie_Ye3;~Haoran_Geng1;~Jiageng_Mao1;~Congyue_Deng1;~Leonidas_Guibas1;~He_Wang5;~Yue_Wang2", "gender": "M;M;M;;F;M;M;M", "homepage": ";https://jay-ye.github.io/;https://github.com/geng-haoran/haorangeng;;https://cs.stanford.edu/~congyue/;http://geometry.stanford.edu/;https://hughw19.github.io;https://yuewang.xyz", "dblp": "358/9222;19/8588;295/7112;;267/5521;g/LeonidasJGuibas;01/6368-10;33/4822-41", "google_scholar": ";aX7Fa7EAAAAJ;Inr-6rEAAAAJ;;XJZ8UBcAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ;roCAWkoAAAAJ;v-AEFIEAAAAJ", "orcid": ";0000-0002-4316-166X;;;;;;", "linkedin": ";;haoran-geng-422778238/;;;;;", "or_profile": "~Yuxuan_Kuang1;~Junjie_Ye3;~Haoran_Geng1;~Jiageng_Mao1;~Congyue_Deng1;~Leonidas_Guibas1;~He_Wang5;~Yue_Wang2", "aff": "Peking University;University of Southern California;Peking University;;Stanford University;Stanford University;Peking University;NVIDIA", "aff_domain": "pku.edu.cn;usc.edu;pku.edu.cn;;stanford.edu;stanford.edu;pku.edu.cn;nvidia.com", "position": "Undergrad student;PhD student;Undergrad student;;PhD student;Full Professor;Assistant Professor;Researcher", "bibtex": "@inproceedings{\nkuang2024ram,\ntitle={{RAM}: Retrieval-Based Affordance Transfer for Generalizable Zero-Shot Robotic Manipulation},\nauthor={Yuxuan Kuang and Junjie Ye and Haoran Geng and Jiageng Mao and Congyue Deng and Leonidas Guibas and He Wang and Yue Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8LPXeGhhbH}\n}", "github": "https://github.com/yxKryptonite/RAM_code", "project": "", "reviewers": "2wfH;38ey;Xxnf", 
"site": "https://openreview.net/forum?id=8LPXeGhhbH", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15288031761134857303&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;2;2;0;3", "aff_unique_norm": "Peking University;University of Southern California;Stanford University;NVIDIA", "aff_unique_dep": ";;;NVIDIA Corporation", "aff_unique_url": "http://www.pku.edu.cn;https://www.usc.edu;https://www.stanford.edu;https://www.nvidia.com", "aff_unique_abbr": "Peking U;USC;Stanford;NVIDIA", "aff_campus_unique_index": "1;2;2", "aff_campus_unique": ";Los Angeles;Stanford", "aff_country_unique_index": "0;1;0;1;1;0;1", "aff_country_unique": "China;United States" }, { "id": "8PcRynpd1m", "title": "Safe Bayesian Optimization for the Control of High-Dimensional Embodied Systems", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning to move is a primary goal for animals and robots, where ensuring safety is often important when optimizing control policies on the embodied systems. For complex tasks such as the control of human or humanoid control, the high-dimensional parameter space adds complexity to the safe optimization effort. Current safe exploration algorithms exhibit inefficiency and may even become infeasible with large high-dimensional input spaces. Furthermore, existing high-dimensional constrained optimization methods neglect safety in the search process. In this paper, we propose High-dimensional Safe Bayesian Optimization with local optimistic exploration (HdSafeBO), a novel approach designed to handle high-dimensional sampling problems under probabilistic safety constraints. We introduce a local optimistic strategy to efficiently and safely optimize the objective function, providing a probabilistic safety guarantee and a cumulative safety violation bound. Through the use of isometric embedding, HdSafeBO addresses problems ranging from a few hundred to several thousand dimensions while maintaining safety guarantees. To our knowledge, HdSafeBO is the first algorithm capable of optimizing the control of high-dimensional musculoskeletal systems with high safety probability. 
We also demonstrate the real-world applicability of HdSafeBO through its use in the safe online optimization of neural stimulation-induced human motion control.", "keywords": "Safe Bayesian Optimization;High-dimensional Embodied System", "primary_area": "", "supplementary_material": "/attachment/f0161e67763789e40f87ed44cf636daf7ad5850d.zip", "author": "Yunyue Wei;Zeji Yi;Hongda Li;Saraswati Soedarmadji;Yanan Sui", "authorids": "~Yunyue_Wei1;~Zeji_Yi1;~Hongda_Li2;~Saraswati_Soedarmadji1;~Yanan_Sui1", "gender": "M;M;M;;M", "homepage": "https://github.com/yunyuewei;https://neuralmachine.cc/members.html;https://hongdali.com;https://www.caltech.edu;https://www.yanansui.com", "dblp": ";;;;151/6934", "google_scholar": ";;;;https://scholar.google.com/citations?hl=en", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Yunyue_Wei1;~Zeji_Yi1;~Hongda_Li2;~Saraswati_Soedarmadji1;~Yanan_Sui1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;California Institute of Technology;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;caltech.edu;tsinghua.edu.cn", "position": "PhD student;MS student;PhD student;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nwei2024safe,\ntitle={Safe Bayesian Optimization for the Control of High-Dimensional Embodied Systems},\nauthor={Yunyue Wei and Zeji Yi and Hongda Li and Saraswati Soedarmadji and Yanan Sui},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8PcRynpd1m}\n}", "github": "", "project": "", "reviewers": "KPXx;kViv;7rTb", "site": "https://openreview.net/forum?id=8PcRynpd1m", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:iriRn8qt48UJ:scholar.google.com/&scioq=Safe+Bayesian+Optimization+for+the+Control+of+High-Dimensional+Embodied+Systems&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Tsinghua University;California Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.caltech.edu", "aff_unique_abbr": "THU;Caltech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pasadena", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United States" }, { "id": "8XFT1PatHy", "title": "Splat-MOVER: Multi-Stage, Open-Vocabulary Robotic Manipulation via Editable Gaussian Splatting", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present Splat-MOVER, a modular robotics stack for open-vocabulary\nrobotic manipulation, which leverages the editability of Gaussian Splatting (GSplat)\nscene representations to enable multi-stage manipulation tasks. Splat-MOVER\nconsists of: (i) ASK-Splat, a GSplat representation that distills semantic and grasp\naffordance features into the 3D scene. ASK-Splat enables geometric, semantic,\nand affordance understanding of 3D scenes, which is critical for many robotics\ntasks; (ii) SEE-Splat, a real-time scene-editing module using 3D semantic masking\nand infilling to visualize the motions of objects that result from robot interactions\nin the real-world. 
SEE-Splat creates a \u201cdigital twin\u201d of the evolving environment\nthroughout the manipulation task; and (iii) Grasp-Splat, a grasp generation module\nthat uses ASK-Splat and SEE-Splat to propose affordance-aligned candidate grasps\nfor open-world objects. ASK-Splat is trained in real-time from RGB images\nin a brief scanning phase prior to operation, while SEE-Splat and Grasp-Splat\nrun in real-time during operation. We demonstrate the superior performance of\nSplat-MOVER in hardware experiments on a Kinova robot compared to two recent\nbaselines in four single-stage, open-vocabulary manipulation tasks. In addition, we\ndemonstrate Splat-MOVER in four multi-stage manipulation tasks, using the edited\nscene to reflect changes due to prior manipulation stages, which is not possible\nwith existing baselines. Video demonstrations and the code for the project are \navailable at https://splatmover.github.io.", "keywords": "Gaussian Splatting;Robotic Grasping;Robotic Manipulation;Scene Editing", "primary_area": "", "supplementary_material": "/attachment/3aee93ae7c2dba5136fb1543122a328fe3a6c623.zip", "author": "Olaolu Shorinwa;Johnathan Tucker;Aliyah Smith;Aiden Swann;Timothy Chen;Roya Firoozi;Monroe David Kennedy;Mac Schwager", "authorids": "~Olaolu_Shorinwa1;jatucker@stanford.edu;aliyah1@stanford.edu;~Aiden_Swann1;~Timothy_Chen1;rfiroozi@stanford.edu;~Monroe_David_Kennedy1;~Mac_Schwager1", "gender": ";;;;M;;M;M", "homepage": "https://msl.stanford.edu;;;http://aidenswann.com;;;https://arm.stanford.edu/;https://msl.stanford.edu/", "dblp": ";;;;;;;22/7012", "google_scholar": ";;;feH32sgAAAAJ;;;x2ZPRfoAAAAJ;-EqbTXoAAAAJ", "orcid": ";;;;0000-0003-3948-8739;;0000-0002-4567-0409;", "linkedin": ";;;;;;monroekennedy3/;", "or_profile": "~Olaolu_Shorinwa1;jatucker@stanford.edu;aliyah1@stanford.edu;~Aiden_Swann1;~Timothy_Chen1;rfiroozi@stanford.edu;~Monroe_David_Kennedy1;~Mac_Schwager1", "aff": "Stanford University;;;Stanford University;Stanford University;;Stanford University;Stanford University", "aff_domain": "stanford.edu;;;stanford.edu;stanford.edu;;stanford.edu;stanford.edu", "position": "Researcher;;;PhD student;PhD student;;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nshorinwa2024splatmover,\ntitle={Splat-{MOVER}: Multi-Stage, Open-Vocabulary Robotic Manipulation via Editable Gaussian Splatting},\nauthor={Olaolu Shorinwa and Johnathan Tucker and Aliyah Smith and Aiden Swann and Timothy Chen and Roya Firoozi and Monroe David Kennedy and Mac Schwager},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8XFT1PatHy}\n}", "github": "https://splatmover.github.io", "project": "", "reviewers": "JeoA;cpNM;wP4F", "site": "https://openreview.net/forum?id=8XFT1PatHy", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 23, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1474140134437468335&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "8Yu0TNJNGK", "title": "AnyRotate: Gravity-Invariant In-Hand Object Rotation with Sim-to-Real 
Touch", "track": "main", "status": "Poster", "tldr": "", "abstract": "Human hands are capable of in-hand manipulation in the presence of different hand motions. For a robot hand, harnessing rich tactile information to achieve this level of dexterity still remains a significant challenge. In this paper, we present AnyRotate, a system for gravity-invariant multi-axis in-hand object rotation using dense featured sim-to-real touch. We tackle this problem by training a dense tactile policy in simulation and present a sim-to-real method for rich tactile sensing to achieve zero-shot policy transfer. Our formulation allows the training of a unified policy to rotate unseen objects about arbitrary rotation axes in any hand direction. In our experiments, we highlight the benefit of capturing detailed contact information when handling objects of varying properties. Interestingly, we found rich multi-fingered tactile sensing can detect unstable grasps and provide a reactive behavior that improves the robustness of the policy.", "keywords": "Tactile Sensing;In-hand Object Rotation;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/278c01253ca0e1933845391d5486180c5d7c4ac0.zip", "author": "Max Yang;chenghua lu;Alex Church;Yijiong Lin;Christopher J. Ford;Haoran Li;Efi Psomopoulou;David A.W. Barton;Nathan F. Lepora", "authorids": "~Max_Yang1;~chenghua_lu1;alex.church@caint.io;~Yijiong_Lin2;~Christopher_J._Ford1;~Haoran_Li19;~Efi_Psomopoulou1;~David_A.W._Barton1;~Nathan_F._Lepora1", "gender": "M;F;;;M;M;F;M;", "homepage": "https://scholar.google.com/citations?user=WQQ1vz8AAAAJ&hl=en;;;https://yijionglin.github.io/;;https://haoran-li.com;https://efi-robotics.com;https://cityinthesky.co.uk/;https://www.lepora.com", "dblp": ";;;;;;;;76/10010", "google_scholar": ";https://scholar.google.com/citations?hl=zh-CN;;SQl1XCUAAAAJ;;;https://scholar.google.gr/citations?user=kVyW-LEAAAAJ;;fb2WiJgAAAAJ", "orcid": ";;;;;0000-0001-8815-073X;0000-0003-3883-4097;;", "linkedin": ";;;;chrisford123/;;efi-psomopoulou/;;", "or_profile": "~Max_Yang1;~chenghua_lu1;alex.church@caint.io;~Yijiong_Lin2;~Christopher_J._Ford1;~Haoran_Li19;~Efi_Psomopoulou1;~David_A.W._Barton1;~Nathan_F._Lepora1", "aff": "University of Bristol;University of Bristol;;University of Bristol;University of Bristol;University of Bristol;University of Bristol;University of Bristol;University of Bristol", "aff_domain": "bristol.ac.uk;bristol.ac.uk;;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk;bristol.ac.uk", "position": "PhD student;PhD student;;PhD student;PhD student;PhD student;Assistant Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nyang2024anyrotate,\ntitle={AnyRotate: Gravity-Invariant In-Hand Object Rotation with Sim-to-Real Touch},\nauthor={Max Yang and chenghua lu and Alex Church and Yijiong Lin and Christopher J. Ford and Haoran Li and Efi Psomopoulou and David A.W. Barton and Nathan F. 
Lepora},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=8Yu0TNJNGK}\n}", "github": "", "project": "", "reviewers": "9rrD;mNc5;Si75", "site": "https://openreview.net/forum?id=8Yu0TNJNGK", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;3", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=145481153800223908&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "University of Bristol", "aff_unique_dep": "", "aff_unique_url": "https://www.bristol.ac.uk", "aff_unique_abbr": "Bristol", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "928V4Umlys", "title": "DriveVLM: The Convergence of Autonomous Driving and Large Vision-Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "A primary hurdle of autonomous driving in urban environments is understanding complex and long-tail scenarios, such as challenging road conditions and delicate human behaviors. We introduce DriveVLM, an autonomous driving system leveraging Vision-Language Models (VLMs) for enhanced scene understanding and planning capabilities. DriveVLM integrates a unique combination of reasoning modules for scene description, scene analysis, and hierarchical planning. Furthermore, recognizing the limitations of VLMs in spatial reasoning and heavy computational requirements, we propose DriveVLM-Dual, a hybrid system that synergizes the strengths of DriveVLM with the traditional autonomous driving pipeline. Experiments on both the nuScenes dataset and our SUP-AD dataset demonstrate the efficacy of DriveVLM and DriveVLM-Dual in handling complex and unpredictable driving conditions. 
Finally, we deploy the DriveVLM-Dual on a production vehicle, verifying it is effective in real-world autonomous driving environments.", "keywords": "Autonomous Driving;Vision Language Model;Dual System", "primary_area": "", "supplementary_material": "/attachment/1b81b0d29e93277361dc141ffd2f5af9f3d0f07e.zip", "author": "Xiaoyu Tian;Junru Gu;Bailin Li;Yicheng Liu;Yang Wang;Zhiyong Zhao;Kun Zhan;Peng Jia;XianPeng Lang;Hang Zhao", "authorids": "~Xiaoyu_Tian3;~Junru_Gu1;~Bailin_Li3;~Yicheng_Liu2;~Yang_Wang56;~Zhiyong_Zhao2;~Kun_Zhan3;~Peng_Jia1;~XianPeng_Lang1;~Hang_Zhao1", "gender": "M;;M;M;M;M;M;M;M;M", "homepage": "https://github.com/134994;;;https://mrmoore98.github.io/liuyicheng/;https://github.com/Emilio66;https://zhankunliauto.github.io/;https://www.lixiang.com/#li;https://weibo.com/u/1409486097;http://www.mit.edu/~hangzhao/;https://github.com/kobeandhebe?tab=repositories", "dblp": ";;;;;;;184/2782.html;;", "google_scholar": ";FPvo18QAAAAJ;;vRmsgQUAAAAJ;;;Z_QY_VwAAAAJ;;DmahiOYAAAAJ;", "orcid": ";;;0000-0003-3211-3088;;;;;;", "linkedin": ";;libailin/;;;;;;;", "or_profile": "~Xiaoyu_Tian3;~Junru_Gu1;~Bailin_Li3;~Yicheng_Liu2;~Zhiyong_Zhao2;~Kun_Zhan3;~Peng_Jia1;~XianPeng_Lang1;~Hang_Zhao1;~YANG_WANG50", "aff": "IIIS, Tsinghua University;Tsinghua University;Li Auto;Tsinghua University;LiAuto;LiAuto;Li Auto;LiAuto;Tsinghua University;Li Auto", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;lixiang.com;mail.tsinghua.edu.cn;lixiang.com;liauto.com;lixiang.com;lixiang.com;tsinghua.edu.cn;lixiang.com", "position": "PhD student;PhD student;Software Engineer;PhD student;AI Architect;Engneering;Researcher;Vice President;Assistant Professor;Researcher", "bibtex": "@inproceedings{\ntian2024drivevlm,\ntitle={Drive{VLM}: The Convergence of Autonomous Driving and Large Vision-Language Models},\nauthor={Xiaoyu Tian and Junru Gu and Bailin Li and Yicheng Liu and Yang Wang and Zhiyong Zhao and Kun Zhan and Peng Jia and XianPeng Lang and Hang Zhao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=928V4Umlys}\n}", "github": "", "project": "", "reviewers": "1XCj;fMiT;cCjc", "site": "https://openreview.net/forum?id=928V4Umlys", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 190, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9069990263513405041&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;2;2;1;2;0;1", "aff_unique_norm": "Tsinghua University;Li Auto;LiAuto", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.liauto.com;https://www.liauto.com", "aff_unique_abbr": "THU;Li Auto;LiAuto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "97QXO0uBEO", "title": "Handling Long-Term Safety and Uncertainty in Safe Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Safety is one of the key issues preventing the deployment of reinforcement learning techniques in real-world robots. While most approaches in the Safe Reinforcement Learning area do not require prior knowledge of constraints and robot kinematics and rely solely on data, it is often difficult to deploy them in complex real-world settings. 
Instead, model-based approaches that incorporate prior knowledge of the constraints and dynamics into the learning framework have proven capable of deploying the learning algorithm directly on the real robot.\nUnfortunately, while an approximated model of the robot dynamics is often available, the safety constraints are task-specific and hard to obtain: they may be too complicated to encode analytically, too expensive to compute, or it may be difficult to envision a priori the long-term safety requirements. In this paper, we bridge this gap by extending the safe exploration method, ATACOM, with learnable constraints, with a particular focus on ensuring long-term safety and handling of uncertainty. Our approach is competitive or superior to state-of-the-art methods in final performance while maintaining safer behavior during training.", "keywords": "Safe Reinforcement Learning;Chance Constraint;Distributional RL", "primary_area": "", "supplementary_material": "/attachment/d59b2cb21876f93d73d9069d2a8215d0922524d0.zip", "author": "Jonas G\u00fcnster;Puze Liu;Jan Peters;Davide Tateo", "authorids": "~Jonas_G\u00fcnster1;~Puze_Liu1;~Jan_Peters3;~Davide_Tateo2", "gender": "M;M;M;M", "homepage": ";https://puzeliu.github.io/;https://www.jan-peters.net;https://www.ias.informatik.tu-darmstadt.de/Team/DavideTateo", "dblp": ";292/4069;p/JanPeters1;214/0808", "google_scholar": ";zg-FMloAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;https://scholar.google.it/citations?user=LGnu3SEAAAAJ", "orcid": ";0000-0001-6887-7704;0000-0002-5266-8091;0000-0002-7193-923X", "linkedin": "jonas-g%C3%BCnster-6b49a2186/;;janrpeters/;", "or_profile": "~Jonas_G\u00fcnster1;~Puze_Liu1;~Jan_Peters3;~Davide_Tateo2", "aff": "Technische Universit\u00e4t Darmstadt;TU Darmstadt;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": "MS student;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\ng{\\\"u}nster2024handling,\ntitle={Handling Long-Term Safety and Uncertainty in Safe Reinforcement Learning},\nauthor={Jonas G{\\\"u}nster and Puze Liu and Jan Peters and Davide Tateo},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=97QXO0uBEO}\n}", "github": "https://github.com/cube1324/d-atacom", "project": "", "reviewers": "ydKm;2U5V;PBPX", "site": "https://openreview.net/forum?id=97QXO0uBEO", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;3;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5378846234935308845&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "9HkElMlPbU", "title": "Contrastive Imitation Learning for Language-guided Multi-Task Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Developing robots capable of executing various manipulation tasks, guided by natural language instructions and visual observations of intricate real-world environments, remains a significant challenge in 
robotics. Such robot agents need to understand linguistic commands and distinguish between the requirements of different tasks. In this work, we present $\\mathtt{\\Sigma\\mbox{-}agent}$, an end-to-end imitation learning agent for multi-task robotic manipulation. $\\mathtt{\\Sigma\\mbox{-}agent}$ incorporates contrastive Imitation Learning (contrastive IL) modules to strengthen vision-language and current-future representations. An effective and efficient multi-view querying Transformer (MVQ-Former) for aggregating representative semantic information is introduced. $\\mathtt{\\Sigma\\mbox{-}agent}$ shows substantial improvement over state-of-the-art methods under diverse settings in 18 RLBench tasks, surpassing RVT by an average of 5.2% and 5.9% in 10 and 100 demonstration training, respectively. $\\mathtt{\\Sigma\\mbox{-}agent}$ also achieves 62% success rate with a single policy in 5 real-world manipulation tasks. The code will be released upon acceptance.", "keywords": "Contrastive imitation learning;Multi-task learning;Robotic manipulation", "primary_area": "", "supplementary_material": "/attachment/41e2e342a34331a80626426e9a2f5760692653d2.zip", "author": "Teli Ma;Jiaming Zhou;Zifan Wang;Ronghe Qiu;Junwei Liang", "authorids": "~Teli_Ma1;~Jiaming_Zhou1;~Zifan_Wang7;~Ronghe_Qiu2;~Junwei_Liang1", "gender": "M;M;M;M;M", "homepage": "https://teleema.github.io/;https://jiaming-zhou.github.io/;https://zifanw.notion.site/;https://github.com/ConnerQiu;https://junweiliang.me/", "dblp": "276/3611;;;;62/10704-1", "google_scholar": "tW37g0UAAAAJ;b3y40w8AAAAJ;GaJXZ-UAAAAJ;https://scholar.google.com/citations?hl=en;bMedjfUAAAAJ", "orcid": ";;;;0000-0003-2219-5569", "linkedin": ";;;;junweiliang/", "or_profile": "~Teli_Ma1;~Jiaming_Zhou1;~Zifan_Wang7;~Ronghe_Qiu2;~Junwei_Liang1", "aff": "Hong Kong University of Science and Technology;Hong Kong University of Science and Technology (Guangzhou);the Hong Kong University of Science and Technology\uff08Guangzhou);Hong Kong University of Science and Technology;Hong Kong University of Science and Technology", "aff_domain": "hkust-gz.edu.cn;hkust-gz.edu.cn;hkust-gz.edu.cn;hkust.edu;ust.hk", "position": "PhD student;PhD student;MS student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nma2024contrastive,\ntitle={Contrastive Imitation Learning for Language-guided Multi-Task Robotic Manipulation},\nauthor={Teli Ma and Jiaming Zhou and Zifan Wang and Ronghe Qiu and Junwei Liang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9HkElMlPbU}\n}", "github": "", "project": "", "reviewers": "Ejqe;HHmD;wN6C;h5Zm;H222;6fNC;9vfD", "site": "https://openreview.net/forum?id=9HkElMlPbU", "pdf_size": 0, "rating": "2;2;3;3;3;3;3", "confidence": "4;4;4;4;5;4;3", "rating_avg": 2.7142857142857144, "confidence_avg": 4.0, "replies_avg": 8, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=542421377238462220&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Hong Kong University of Science and Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.ust.hk", "aff_unique_abbr": "HKUST", "aff_campus_unique_index": "0;0;1;0;0", "aff_campus_unique": "Hong Kong SAR;Guangzhou", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "China" }, { "id": "9XV3dBqcfe", "title": "Generalized Animal Imitator: Agile Locomotion with Versatile Motion Prior", "track": "main", 
"status": "Poster", "tldr": "", "abstract": "The agility of animals, particularly in complex activities such as running, turning, jumping, and backflipping, stands as an exemplar for robotic system design. Transferring this suite of behaviors to legged robotic systems introduces essential inquiries: How can a robot be trained to learn multiple locomotion behaviors simultaneously? How can the robot execute these tasks with a smooth transition? How to integrate these skills for wide-range applications? This paper introduces the Versatile Instructable Motion prior (VIM) \u2013 a Reinforcement Learning framework designed to incorporate a range of agile locomotion tasks suitable for advanced robotic applications. Our framework enables legged robots to learn diverse agile low-level skills by imitating animal motions and manually designed motions. Our Functionality reward guides the robot's ability to adopt varied skills, and our Stylization reward ensures that robot motions align with reference motions. Our evaluations of the VIM framework span both simulation environments and real-world deployment. To the best of our knowledge, this is the first work that allows a robot to concurrently learn diverse agile locomotion skills using a single learning-based controller in the real world.", "keywords": "Legged Robots;Imitation Learning;Agile Locomotion", "primary_area": "", "supplementary_material": "/attachment/38f42c37a4ffe3376ccbbaef5a83c7167cea7566.zip", "author": "Ruihan Yang;Zhuoqun Chen;Jianhan Ma;Chongyi Zheng;Yiyu Chen;Quan Nguyen;Xiaolong Wang", "authorids": "~Ruihan_Yang2;~Zhuoqun_Chen1;~Jianhan_Ma1;~Chongyi_Zheng1;~Yiyu_Chen4;~Quan_Nguyen5;~Xiaolong_Wang3", "gender": "M;M;M;M;M;;M", "homepage": "http://rchalyang.github.io/;https://zhuoqun-chen.github.io;https://kingspenguin.github.io/;https://chongyi-zheng.github.io;https://www.linkedin.com/in/yiyu-chen-88079b161/;https://sites.usc.edu/quann/;https://xiaolonw.github.io/", "dblp": ";;;250/9267;;;91/952-4", "google_scholar": "b-o1o7cAAAAJ;Qpfxtc8AAAAJ;uJvw2KUAAAAJ;bezWXYcAAAAJ;;s5_69i0AAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;;;", "linkedin": ";zhuoqun-chen/;jianhan-ma-ab675922a/;;;qtn/;", "or_profile": "~Ruihan_Yang2;~Zhuoqun_Chen1;~Jianhan_Ma1;~Chongyi_Zheng1;~Yiyu_Chen4;~Quan_Nguyen5;~Xiaolong_Wang3", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;Princeton University;University of Southern California;University of Southern California;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;princeton.edu;usc.edu;usc.edu;ucsd.edu", "position": "PhD student;MS student;MS student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024generalized,\ntitle={Generalized Animal Imitator: Agile Locomotion with Versatile Motion Prior},\nauthor={Ruihan Yang and Zhuoqun Chen and Jianhan Ma and Chongyi Zheng and Yiyu Chen and Quan Nguyen and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9XV3dBqcfe}\n}", "github": "", "project": "", "reviewers": "wyry;M6Lk;wE5L", "site": "https://openreview.net/forum?id=9XV3dBqcfe", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16838646445654002000&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, 
"aff_unique_index": "0;0;0;1;2;2;0", "aff_unique_norm": "University of California, San Diego;Princeton University;University of Southern California", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ucsd.edu;https://www.princeton.edu;https://www.usc.edu", "aff_unique_abbr": "UCSD;Princeton;USC", "aff_campus_unique_index": "0;0;0;2;2;0", "aff_campus_unique": "San Diego;;Los Angeles", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "9aZ4ehSTRc", "title": "Guided Reinforcement Learning for Robust Multi-Contact Loco-Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Reinforcement learning (RL) has shown remarkable proficiency in developing robust control policies for contact-rich applications. However, it typically requires meticulous Markov Decision Process (MDP) designing tailored to each task and robotic platform. This work addresses this challenge by creating a systematic approach to behavior synthesis and control for multi-contact loco-manipulation.\nWe define a task-independent MDP formulation to learn robust RL policies using a single demonstration (per task) generated from a fast model-based trajectory optimization method. Our framework is validated on diverse real-world tasks, such as navigating spring-loaded doors and manipulating heavy dishwashers. The learned behaviors can handle dynamic uncertainties and external disturbances, showcasing recovery maneuvers, such as re-grasping objects during execution. Finally, we successfully transfer the policies to a real robot, demonstrating the approach's practical viability.", "keywords": "Whole-body Loco-Manipulation;Reinforcement Learning;Legged Mobile Manipulators", "primary_area": "", "supplementary_material": "/attachment/c65c3ebdae36cddc775c41a02304da8cf5af0ee0.zip", "author": "Jean Pierre Sleiman;Mayank Mittal;Marco Hutter", "authorids": "~Jean_Pierre_Sleiman1;~Mayank_Mittal1;~Marco_Hutter1", "gender": "M;M;M", "homepage": ";https://mayankm96.github.io;http://www.rsl.ethz.ch", "dblp": ";;04/2753", "google_scholar": "FW-U2M8AAAAJ;iVXG-IkAAAAJ;https://scholar.google.ch/citations?user=DO3quJYAAAAJ", "orcid": "0000-0002-9935-8787;;0000-0002-4285-4990", "linkedin": ";mayankm-0096/;", "or_profile": "~Jean_Pierre_Sleiman1;~Mayank_Mittal1;~Marco_Hutter1", "aff": "ETHZ - ETH Zurich;NVIDIA;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;nvidia.com;ethz.ch", "position": "PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\nsleiman2024guided,\ntitle={Guided Reinforcement Learning for Robust Multi-Contact Loco-Manipulation},\nauthor={Jean Pierre Sleiman and Mayank Mittal and Marco Hutter},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9aZ4ehSTRc}\n}", "github": "", "project": "", "reviewers": "kZSQ;1Sf9;9jLh", "site": "https://openreview.net/forum?id=9aZ4ehSTRc", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1629906603140603384&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0", "aff_unique_norm": "ETH Zurich;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.ethz.ch;https://www.nvidia.com", "aff_unique_abbr": "ETHZ;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;1;0", "aff_country_unique": "Switzerland;United States" }, { "id": "9dsBQhoqVr", "title": "Fleet Supervisor Allocation: A Submodular Maximization Approach", "track": "main", "status": "Poster", "tldr": "", "abstract": "In real-world scenarios, the data collected by robots in diverse and unpredictable environments is crucial for enhancing their perception and decision-making models. This data is predominantly collected under human supervision, particularly through imitation learning (IL), where robots learn complex tasks by observing human supervisors. However, the deployment of multiple robots and supervisors to accelerate the learning process often leads to data redundancy and inefficiencies, especially as the scale of robot fleets increases. Moreover, the reliance on teleoperation for supervision introduces additional challenges due to potential network connectivity issues. \nTo address these issues in data collection, we introduce an Adaptive Submodular Allocation policy, ASA, designed for efficient human supervision allocation within multi-robot systems under uncertain connectivity conditions. Our approach reduces data redundancy by balancing the informativeness and diversity of data collection, and is capable of accommodating connectivity variances. We evaluate the effectiveness of ASA in simulations with 100 robots across four different environments and various network settings, including a real-world teleoperation scenario over a 5G network. We train and test our policy, ASA, and state-of-the-art policies utilizing NVIDIA's Isaac Gym. Our results show that ASA enhances the return on human effort by up to $3.37\\times$, outperforming current baselines in all simulated scenarios and providing robustness against connectivity disruptions.", "keywords": "Imitation Learning;Submodular Maximization;Fleet Learning", "primary_area": "", "supplementary_material": "/attachment/951a39444173d20924a61c00f4353598eda9517d.zip", "author": "Oguzhan Akcin;Ahmet Ege Tanriverdi;Kaan Kale;Sandeep P. Chinchali", "authorids": "~Oguzhan_Akcin2;~Ahmet_Ege_Tanriverdi1;~Kaan_Kale1;~Sandeep_P._Chinchali1", "gender": "M;M;M;", "homepage": ";;;https://www.ece.utexas.edu/people/faculty/sandeep-chinchali", "dblp": "311/3023;;;85/8366", "google_scholar": "2elIEXoAAAAJ;;oGQHjioAAAAJ;262ASa4AAAAJ", "orcid": ";;;", "linkedin": "oguzhan-akcin-0907/;ahmet-ege-tanriverdi-a7b81a219;kaan-kale-209843164/;", "or_profile": "~Oguzhan_Akcin2;~Ahmet_Ege_Tanriverdi1;~Kaan_Kale1;~Sandeep_Chinchali1", "aff": "The University of Texas at Austin;Bogazici University;Bogazici University;University of Texas at Austin", "aff_domain": "utexas.edu;std.bogazici.edu.tr;bogazici.edu.tr;utexas.edu", "position": "PhD student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nakcin2024fleet,\ntitle={Fleet Supervisor Allocation: A Submodular Maximization Approach},\nauthor={Oguzhan Akcin and Ahmet Ege Tanriverdi and Kaan Kale and Sandeep P. 
Chinchali},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9dsBQhoqVr}\n}", "github": "https://github.com/UTAustin-SwarmLab/Fleet-Supervisor-Allocation", "project": "", "reviewers": "ZFVB;rf9j;hzFg", "site": "https://openreview.net/forum?id=9dsBQhoqVr", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:5hhy4TSaWlAJ:scholar.google.com/&scioq=Fleet+Supervisor+Allocation:+A+Submodular+Maximization+Approach&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;0", "aff_unique_norm": "University of Texas at Austin;Bogazici University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.boun.edu.tr", "aff_unique_abbr": "UT Austin;BU", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;1;1;0", "aff_country_unique": "United States;T\u00fcrkiye" }, { "id": "9iG3SEbMnL", "title": "ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Representing robotic manipulation tasks as constraints that associate the robot and the environment is a promising way to encode desired robot behaviors. However, it remains unclear how to formulate the constraints such that they are 1) versatile to diverse tasks, 2) free of manual labeling, and 3) optimizable by off-the-shelf solvers to produce robot actions in real-time. In this work, we introduce Relational Keypoint Constraints (ReKep), a visually-grounded representation for constraints in robotic manipulation. Specifically, ReKep are expressed as Python functions mapping a set of 3D keypoints in the environment to a numerical cost. We demonstrate that by representing a manipulation task as a sequence of Relational Keypoint Constraints, we can employ a hierarchical optimization procedure to solve for robot actions (represented by a sequence of end-effector poses in SE(3)) with a perception-action loop at a real-time frequency. Furthermore, in order to circumvent the need for manual specification of ReKep for each new task, we devise an automated procedure that leverages large vision models and vision-language models to produce ReKep from free-form language instructions and RGB-D observation. 
We present system implementations on a mobile single-arm platform and a stationary dual-arm platform that can perform a large variety of manipulation tasks, featuring multi-stage, in-the-wild, bimanual, and reactive behaviors, all without task-specific data or environment models.", "keywords": "Structural Representation;Model-Based Planning;Foundation Models", "primary_area": "", "supplementary_material": "/attachment/73867ed05093550cd8131231d2eece8f1ef0ec1c.zip", "author": "Wenlong Huang;Chen Wang;Yunzhu Li;Ruohan Zhang;Li Fei-Fei", "authorids": "~Wenlong_Huang1;~Chen_Wang16;~Yunzhu_Li1;~Ruohan_Zhang1;~Li_Fei-Fei1", "gender": "M;M;M;M;F", "homepage": "https://wenlong.page;http://www.chenwangjeremy.net/;https://yunzhuli.github.io/;https://ai.stanford.edu/~zharu/;https://profiles.stanford.edu/fei-fei-li", "dblp": "82/2872;;182/1831;;79/2528", "google_scholar": "hYVMrzsAAAAJ;lStkAzsAAAAJ;WlA92lcAAAAJ;-bqvNWoAAAAJ;rDfyQnIAAAAJ", "orcid": ";;;;", "linkedin": ";;;;fei-fei-li-4541247/", "or_profile": "~Wenlong_Huang1;~Chen_Wang16;~Yunzhu_Li1;~Ruohan_Zhang1;~Li_Fei-Fei1", "aff": "NVIDIA;Computer Science Department, Stanford University;University of Illinois Urbana-Champaign;Stanford University;Stanford University", "aff_domain": "nvidia.com;cs.stanford.edu;illinois.edu;stanford.edu;stanford.edu", "position": "Intern;PhD student;Assistant Professor;Postdoc;Full Professor", "bibtex": "@inproceedings{\nhuang2024rekep,\ntitle={ReKep: Spatio-Temporal Reasoning of Relational Keypoint Constraints for Robotic Manipulation},\nauthor={Wenlong Huang and Chen Wang and Yunzhu Li and Ruohan Zhang and Li Fei-Fei},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9iG3SEbMnL}\n}", "github": "https://github.com/huangwl18/ReKep", "project": "", "reviewers": "JmTk;1ZB8;v9TF;yq2j", "site": "https://openreview.net/forum?id=9iG3SEbMnL", "pdf_size": 0, "rating": "1;3;3;3", "confidence": "4;5;4;4", "rating_avg": 2.5, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 97, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11870520317462391689&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "NVIDIA;Stanford University;University of Illinois Urbana-Champaign", "aff_unique_dep": "NVIDIA Corporation;Computer Science Department;", "aff_unique_url": "https://www.nvidia.com;https://www.stanford.edu;https://illinois.edu", "aff_unique_abbr": "NVIDIA;Stanford;UIUC", "aff_campus_unique_index": "1;2;1;1", "aff_campus_unique": ";Stanford;Urbana-Champaign", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "9jJP2J1oBP", "title": "Leveraging Mutual Information for Asymmetric Learning under Partial Observability", "track": "main", "status": "Poster", "tldr": "", "abstract": "Even though partial observability is prevalent in robotics, most reinforcement learning studies avoid it due to the difficulty of learning a policy that can efficiently memorize past events and seek information. Fortunately, in many cases, learning can be done in an asymmetric setting where states are available during training but not during execution. Prior studies often leverage the state to indirectly influence the training of a history-based actor (actor-critic methods) or a history-based critic (value-based methods). 
Instead, we propose using state-observation and state-history mutual information to improve the agent's architecture and ability to seek information and memorize efficiently through intrinsic rewards and an auxiliary task. Our method outperforms strong baselines through extensive experiments and achieves successful sim-to-real transfers to a real robot.", "keywords": "Partial Observability;Mutual Information;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/39344a49b05e52800ba528bf1df6d7295295085f.zip", "author": "Hai Huu Nguyen;Long Dinh Van The;Christopher Amato;Robert Platt", "authorids": "~Hai_Huu_Nguyen1;~Long_Dinh_Van_The1;~Christopher_Amato1;~Robert_Platt1", "gender": "M;M;M;", "homepage": "https://hai-h-nguyen.github.io/;https://longdvt.github.io/;http://www.ccs.neu.edu/home/camato/index.html;http://www.ccs.neu.edu/home/rplatt/", "dblp": ";;10/3254;39/5434", "google_scholar": "5b9ncWoAAAAJ;540Phy0AAAAJ;-8-sD-sAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;;", "linkedin": ";longdinh007/;;", "or_profile": "~Hai_Huu_Nguyen1;~Long_Dinh_Van_The1;~Christopher_Amato1;~Robert_Platt1", "aff": "Northeastern University;Hanoi University of Science and Technology;Northeastern University;Northeastern University", "aff_domain": "northeastern.edu;hust.edu.vn;northeastern.edu;neu.edu", "position": "PhD student;Undergrad student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nnguyen2024leveraging,\ntitle={Leveraging Mutual Information for Asymmetric Learning under Partial Observability},\nauthor={Hai Huu Nguyen and Long Dinh Van The and Christopher Amato and Robert Platt},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=9jJP2J1oBP}\n}", "github": "https://sites.google.com/view/mi-asym-pomdp", "project": "", "reviewers": "VPtp;TSYf;p5c4", "site": "https://openreview.net/forum?id=9jJP2J1oBP", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:J3fElbq-SZUJ:scholar.google.com/&scioq=Leveraging+Mutual+Information+for+Asymmetric+Learning+under+Partial+Observability&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Northeastern University;Hanoi University of Science and Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.northeastern.edu;https://www.hust.edu.vn", "aff_unique_abbr": "NEU;HUST", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hanoi", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;Vietnam" }, { "id": "A1hpY5RNiH", "title": "What Makes Pre-Trained Visual Representations Successful for Robust Manipulation?", "track": "main", "status": "Poster", "tldr": "", "abstract": "Inspired by the success of transfer learning in computer vision, roboticists have investigated visual pre-training as a means to improve the learning efficiency and generalization ability of policies learned from pixels. To that end, past work has favored large object interaction datasets, such as first-person videos of humans completing diverse tasks, in pursuit of manipulation-relevant features. Although this approach improves the efficiency of policy learning, it remains unclear how reliable these representations are in the presence of distribution shifts that arise commonly in robotic applications. 
Surprisingly, we find that visual representations designed for control tasks do not necessarily generalize under subtle changes in lighting and scene texture or the introduction of distractor objects. To understand what properties _do_ lead to robust representations, we compare the performance of 15 pre-trained vision models under different visual appearances. We find that emergent segmentation ability is a strong predictor of out-of-distribution generalization among ViT models. The rank order induced by this metric is more predictive than metrics that have previously guided generalization research within computer vision and machine learning, such as downstream ImageNet accuracy, in-domain accuracy, or shape-bias as evaluated by cue-conflict performance. We test this finding extensively on a suite of distribution shifts in ten tasks across two simulated manipulation environments. On the ALOHA setup, segmentation score predicts real-world performance after offline training with 50 demonstrations.", "keywords": "representation learning;manipulation;visual features", "primary_area": "", "supplementary_material": "/attachment/575e2da8ba3d7fd7c1a6ed8bca295f2216d30995.zip", "author": "Kaylee Burns;Zach Witzel;Jubayer Ibn Hamid;Tianhe Yu;Chelsea Finn;Karol Hausman", "authorids": "~Kaylee_Burns2;~Zach_Witzel1;~Jubayer_Ibn_Hamid1;~Tianhe_Yu1;~Chelsea_Finn1;~Karol_Hausman2", "gender": "F;;;M;F;", "homepage": "https://kayburns.github.io;;;https://cs.stanford.edu/~tianheyu/;https://ai.stanford.edu/~cbfinn/;", "dblp": "217/3002;;;192/1797;131/1783;", "google_scholar": "N_rVVG8AAAAJ;;;;vfPE6hgAAAAJ;", "orcid": ";;;;;", "linkedin": ";zachwitz;;;;", "or_profile": "~Kaylee_Burns2;~Zach_Witzel1;~Jubayer_Ibn_Hamid1;~Tianhe_Yu1;~Chelsea_Finn1;~Karol_Hausman2", "aff": "Stanford University;Stanford University;;Google Brain;Google;", "aff_domain": "stanford.edu;stanford.edu;;google.com;google.com;", "position": "PhD student;Undergrad student;;Research Scientist;Research Scientist;", "bibtex": "@inproceedings{\nburns2024what,\ntitle={What Makes Pre-Trained Visual Representations Successful for Robust Manipulation?},\nauthor={Kaylee Burns and Zach Witzel and Jubayer Ibn Hamid and Tianhe Yu and Chelsea Finn and Karol Hausman},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A1hpY5RNiH}\n}", "github": "", "project": "", "reviewers": "hDPj;cqr9;nqHm", "site": "https://openreview.net/forum?id=A1hpY5RNiH", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;3;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4951091306662038832&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google Brain", "aff_unique_url": "https://www.stanford.edu;https://brain.google.com", "aff_unique_abbr": "Stanford;Google Brain", "aff_campus_unique_index": "0;0;1;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "A6ikGJRaKL", "title": "KOROL: Learning Visualizable Object Feature with Koopman Operator Rollout for Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning dexterous manipulation skills presents significant challenges due to complex nonlinear dynamics that underlie the 
interactions between objects and multi-fingered hands. Koopman operators have emerged as a robust method for modeling such nonlinear dynamics within a linear framework.\nHowever, current methods rely on runtime access to ground-truth (GT) object states, making them unsuitable for vision-based practical applications.\nUnlike image-to-action policies that implicitly learn visual features for control, we use a dynamics model, specifically the Koopman operator, to learn visually interpretable object features critical for robotic manipulation within a scene.\nWe construct a Koopman operator using object features predicted by a feature extractor and utilize it to auto-regressively advance system states. We train the feature extractor to embed scene information into object features, thereby enabling the accurate propagation of robot trajectories.\nWe evaluate our approach on simulated and real-world robot tasks, with results showing that it outperformed the model-based imitation learning NDP by 1.08$\\times$ and the image-to-action Diffusion Policy by 1.16$\\times$. The results suggest that our method maintains task success rates with learned features and extends applicability to real-world manipulation without GT object states. Project video and code are available at: https://github.com/hychen-naza/KOROL.", "keywords": "Manipulation;Koopman Operator;Visual Representation Learning", "primary_area": "", "supplementary_material": "/attachment/49581c3230a9894bb2c111223d4ba59d204f6943.zip", "author": "Hongyi Chen;ABULIKEMU ABUDUWEILI;Aviral Agrawal;Yunhai Han;Harish Ravichandar;Changliu Liu;Jeffrey Ichnowski", "authorids": "~Hongyi_Chen2;~ABULIKEMU_ABUDUWEILI1;~Aviral_Agrawal1;~Yunhai_Han1;~Harish_Ravichandar1;~Changliu_Liu1;~Jeffrey_Ichnowski1", "gender": "M;M;M;M;;F;M", "homepage": "https://hychen-naza.github.io/;https://walleclipse.github.io/;https://aviral-agrawal.github.io;https://y8han.github.io/;http://harishravichandar.com/;http://www.cs.cmu.edu/~cliu6/index.html;https://ichnow.ski", "dblp": "49/5284;245/8652;;276/6126;237/9959;166/3563;89/1741", "google_scholar": "https://scholar.google.com/citations?view_op=list_works;6Oro5g8AAAAJ;NiW49k4AAAAJ;lsN3nY8AAAAJ;d2HP6SMAAAAJ;;1OdtfywAAAAJ", "orcid": ";;;;0000-0002-6635-2637;;0000-0003-4874-9478", "linkedin": ";;aviral-agrawal-783a01162/;;;;", "or_profile": "~Hongyi_Chen2;~ABULIKEMU_ABUDUWEILI1;~Aviral_Agrawal1;~Yunhai_Han1;~Harish_Ravichandar1;~Changliu_Liu1;~Jeffrey_Ichnowski1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Georgia Institute of Technology;Georgia Institute of Technology;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;andrew.cmu.edu;andrew.cmu.edu;gatech.edu;gatech.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;MS student;PhD student;Assistant Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nchen2024korol,\ntitle={{KOROL}: Learning Visualizable Object Feature with Koopman Operator Rollout for Manipulation},\nauthor={Hongyi Chen and ABULIKEMU ABUDUWEILI and Aviral Agrawal and Yunhai Han and Harish Ravichandar and Changliu Liu and Jeffrey Ichnowski},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=A6ikGJRaKL}\n}", "github": "https://github.com/hychen-naza/KOROL", "project": "", "reviewers": "aTms;ekAf;wypC", "site": "https://openreview.net/forum?id=A6ikGJRaKL", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, 
"confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12215007272326697155&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.gatech.edu", "aff_unique_abbr": "CMU;Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "AEq0onGrN2", "title": "Physically Embodied Gaussian Splatting: A Visually Learnt and Physically Grounded 3D Representation for Robotics", "track": "main", "status": "Poster", "tldr": "", "abstract": "For robots to robustly understand and interact with the physical world, it is highly beneficial to have a comprehensive representation -- modelling geometry, physics, and visual observations -- that informs perception, planning, and control algorithms. We propose a novel dual \"Gaussian-Particle\" representation that models the physical world while (i) enabling predictive simulation of future states and (ii) allowing online correction from visual observations in a dynamic world. Our representation comprises particles that capture the geometrical aspect of objects in the world and can be used alongside a particle-based physics system to anticipate physically plausible future states. Attached to these particles are 3D Gaussians that render images from any viewpoint through a splatting process thus capturing the visual state. By comparing the predicted and observed images, our approach generates \"visual forces\" that correct the particle positions while respecting known physical constraints. By integrating predictive physical modeling with continuous visually-derived corrections, our unified representation reasons about the present and future while synchronizing with reality. We validate our approach on 2D and 3D tracking tasks as well as photometric reconstruction quality. 
Videos are found at https://embodied-gaussians.github.io/", "keywords": "3D Representation;Gaussian Splatting;Robotics;Tracking;Physics", "primary_area": "", "supplementary_material": "/attachment/97e886f5218e2ab43716005edaf75b31d6bf5956.zip", "author": "Jad Abou-Chakra;Krishan Rana;Feras Dayoub;Niko Suenderhauf", "authorids": "~Jad_Abou-Chakra1;~Krishan_Rana1;~Feras_Dayoub1;~Niko_Suenderhauf1", "gender": "M;M;M;M", "homepage": ";https://krishanrana.github.io/;http://www.ferasdayoub.com/;http://nikosuenderhauf.info", "dblp": ";70/4142;98/978;", "google_scholar": ";-hYjPxsAAAAJ;https://scholar.google.com.au/citations?user=Lzs8CuEAAAAJ;https://scholar.google.com.au/citations?user=WnKjfFEAAAAJ", "orcid": "0000-0002-9122-3132;0000-0002-9028-9295;0000-0002-4234-7374;", "linkedin": ";krishanrana/;feras-dayoub-6b8454114/;nikosuenderhauf/", "or_profile": "~Jad_Abou-Chakra1;~Krishan_Rana1;~Feras_Dayoub1;~Niko_Suenderhauf1", "aff": "Queensland University of Technology;Queensland University of Technology;University of Adelaide;Queensland University of Technology", "aff_domain": "qut.edu.au;qut.edu.au;adelaide.edu.au;qut.edu.au", "position": "PhD student;Postdoc;Associate Professor;Full Professor", "bibtex": "@inproceedings{\nabou-chakra2024physically,\ntitle={Physically Embodied Gaussian Splatting: A Visually Learnt and Physically Grounded 3D Representation for Robotics},\nauthor={Jad Abou-Chakra and Krishan Rana and Feras Dayoub and Niko Suenderhauf},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AEq0onGrN2}\n}", "github": "", "project": "", "reviewers": "ud25;iPN3;pZSN", "site": "https://openreview.net/forum?id=AEq0onGrN2", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14189348868082370627&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Queensland University of Technology;University of Adelaide", "aff_unique_dep": ";", "aff_unique_url": "https://www.qut.edu.au;https://www.adelaide.edu.au", "aff_unique_abbr": "QUT;Adelaide", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Australia" }, { "id": "AGG1zlrrMw", "title": "Neural Attention Field: Emerging Point Relevance in 3D Scenes for One-Shot Dexterous Grasping", "track": "main", "status": "Poster", "tldr": "", "abstract": "One-shot transfer of dexterous grasps to novel scenes with object and context variations has been a challenging problem. While distilled feature fields from large vision models have enabled semantic correspondences across 3D scenes, their features are point-based and restricted to object surfaces, limiting their capability of modeling complex semantic feature distributions for hand-object interactions. In this work, we propose the *neural attention field* for representing semantic-aware dense feature fields in the 3D space by modeling inter-point relevance instead of individual point features. Core to it is a transformer decoder that computes the cross-attention between any 3D query point with all the scene points, and provides the query point feature with an attention-based aggregation. We further propose a self-supervised framework for training the transformer decoder from only a few 3D pointclouds without hand demonstrations. 
Post-training, the attention field can be applied to novel scenes for semantics-aware dexterous grasping from one-shot demonstration. Experiments show that our method provides better optimization landscapes by encouraging the end-effector to focus on task-relevant scene regions, resulting in significant improvements in success rates on real robots compared with the feature-field-based methods.", "keywords": "Desterous Grasping;One-Shot Manipulation;Distilled Feature Field;Neural Implicit Field;Self-Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/4b5c004e23e9bb3b7f5794c0d82f478b4f42e9f8.zip", "author": "Qianxu Wang;Congyue Deng;Tyler Ga Wei Lum;Yuanpei Chen;Yaodong Yang;Jeannette Bohg;Yixin Zhu;Leonidas Guibas", "authorids": "~Qianxu_Wang1;~Congyue_Deng1;~Tyler_Ga_Wei_Lum1;~Yuanpei_Chen2;~Yaodong_Yang1;~Jeannette_Bohg1;~Yixin_Zhu1;~Leonidas_Guibas1", "gender": ";F;M;M;M;;M;M", "homepage": "https://github.com/Halowangqx/Halowangqx.github.io;https://cs.stanford.edu/~congyue/;https://tylerlum.github.io/;https://cypypccpy.github.io/;https://www.yangyaodong.com;https://web.stanford.edu/~bohg/;https://yzhu.io/;http://geometry.stanford.edu/", "dblp": ";267/5521;;1234567;170/1496-1;52/7377;91/1103-1.html;g/LeonidasJGuibas", "google_scholar": ";XJZ8UBcAAAAJ;kPq6-XIAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;rjnJnEkAAAAJ;qG9l6JEAAAAJ;https://scholar.google.com.tw/citations?user=5JlEyTAAAAAJ", "orcid": ";;;0000-0002-0033-492X;0000-0001-8132-5613;0000-0002-4921-7193;0000-0001-7024-1545;", "linkedin": ";;tyler-lum/;;yaodong-yang;;;", "or_profile": "~Qianxu_Wang1;~Congyue_Deng1;~Tyler_Ga_Wei_Lum1;~Yuanpei_Chen2;~Yaodong_Yang1;~Jeannette_Bohg1;~Yixin_Zhu1;~Leonidas_Guibas1", "aff": "Peking University;Stanford University;Stanford University;PsiRobot;Peking University;Stanford University;Peking University;Stanford University", "aff_domain": "pku.edu.cn;stanford.edu;stanford.edu;psibot.ai;pku.edu.cn;stanford.edu;pku.edu.cn;stanford.edu", "position": "Undergrad student;PhD student;PhD student;Researcher;Assistant Professor;Assistant Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwang2024neural,\ntitle={Neural Attention Field: Emerging Point Relevance in 3D Scenes for One-Shot Dexterous Grasping},\nauthor={Qianxu Wang and Congyue Deng and Tyler Ga Wei Lum and Yuanpei Chen and Yaodong Yang and Jeannette Bohg and Yixin Zhu and Leonidas Guibas},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AGG1zlrrMw}\n}", "github": "", "project": "", "reviewers": "gKTk;36Qt;vfCH", "site": "https://openreview.net/forum?id=AGG1zlrrMw", "pdf_size": 0, "rating": "1;2;3", "confidence": "3;4;4", "rating_avg": 2.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2387023786737905532&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;0;1;0;1", "aff_unique_norm": "Peking University;Stanford University;PsiRobot", "aff_unique_dep": ";;", "aff_unique_url": "http://www.pku.edu.cn;https://www.stanford.edu;", "aff_unique_abbr": "Peking U;Stanford;", "aff_campus_unique_index": "1;1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;1;1;0;1;0;1", "aff_country_unique": "China;United States;" }, { "id": "AhEE5wrcLU", "title": "Velociraptor: Leveraging Visual 
Foundation Models for Label-Free, Risk-Aware Off-Road Navigation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Traversability analysis in off-road regimes is a challenging task that requires understanding of multi-modal inputs such as camera and LiDAR. These measurements are often sparse, noisy, and difficult to interpret, particularly in the off-road setting. Existing systems are very engineering-intensive, often requiring hand-tuning of traversability rules and manual annotation of semantic labels. Furthermore, existing methods for analyzing traversability risk and uncertainty are computationally expensive or not well-calibrated. We propose Velociraptor, a traversability analysis system that performs [veloci]ty-informed, [r]isk-[a]ware [p]erception and [t]raversability for [o]ff-[r]oad driving without any human annotations. We achieve this via the use of visual foundation models (VFMs) and geometric mapping to produce a rich visual-geometric representation of the robot's local environment. We then leverage this representation to produce costmaps, speedmaps, and uncertainty maps using state-of-the-art fully self-supervised techniques. Our approach enables intelligent high-speed off-road navigation with zero human annotation, and with about forty minutes of expert data, outperforms several geometric and semantic traversability baselines, both in offline and real-world robot trials across multiple challenging off-road sites.", "keywords": "Field Robotics;Self-Supervised Learning;Visual Foundation Models", "primary_area": "", "supplementary_material": "/attachment/ab31fe2566e74c3ee0af29e0a42e7d5c9b4fbed2.zip", "author": "Samuel Triest;Matthew Sivaprakasam;Shubhra Aich;David Fan;Wenshan Wang;Sebastian Scherer", "authorids": "~Samuel_Triest1;~Matthew_Sivaprakasam1;~Shubhra_Aich1;~David_Fan1;~Wenshan_Wang2;~Sebastian_Scherer1", "gender": "M;M;M;;F;M", "homepage": ";https://matthewjsiv.github.io;https://littleaich.github.io/;https://scholar.google.com/citations?user=vbhA9hwAAAAJ&hl=en;http://www.wangwenshan.com;https://theairlab.org", "dblp": ";;168/3861;;;253/5743", "google_scholar": ";6PymfB8AAAAJ;https://scholar.google.ca/citations?user=kLYaMVIAAAAJ;;https://scholar.google.com/citations?hl=en;gxoPfIYAAAAJ", "orcid": ";;0000-0002-5117-5164;;;0000-0002-8373-4688", "linkedin": "striest/;;shubhra-aich/;;;sebastian-scherer-a026961a/", "or_profile": "~Samuel_Triest1;~Matthew_Sivaprakasam1;~Shubhra_Aich1;~David_Fan1;~Wenshan_Wang2;~Sebastian_Scherer1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University;Jet Propulsion Laboratory;School of Computer Science, Carnegie Mellon University;Near Earth Autonomy Inc.", "aff_domain": "andrew.cmu.edu;cmu.edu;cmu.edu;jpl.nasa.gov;cs.cmu.edu;nearearth.aero", "position": "PhD student;MS student;Researcher;Researcher;Researcher;Senior Scientist", "bibtex": "@inproceedings{\ntriest2024velociraptor,\ntitle={Velociraptor: Leveraging Visual Foundation Models for Label-Free, Risk-Aware Off-Road Navigation},\nauthor={Samuel Triest and Matthew Sivaprakasam and Shubhra Aich and David Fan and Wenshan Wang and Sebastian Scherer},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AhEE5wrcLU}\n}", "github": "", "project": "", "reviewers": "K4L2;Wz3t;XNej;yWjL", "site": "https://openreview.net/forum?id=AhEE5wrcLU", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "4;4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 6, 
"corr_rating_confidence": -0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6977986742789665778&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0;2", "aff_unique_norm": "Carnegie Mellon University;Jet Propulsion Laboratory;Near Earth Autonomy", "aff_unique_dep": ";;", "aff_unique_url": "https://www.cmu.edu;https://www.jpl.nasa.gov;https://www.nearearthautonomy.com", "aff_unique_abbr": "CMU;JPL;NEA", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Pasadena;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "AsbyZRdqPv", "title": "Simple Masked Training Strategies Yield Control Policies That Are Robust to Sensor Failure", "track": "main", "status": "Poster", "tldr": "", "abstract": "Sensor failure is common when robots are deployed in the real world, as sensors naturally wear out over time. Such failures can lead to catastrophic outcomes, including damage to the robot from unexpected robot behaviors such as falling during walking. Previous work has tried to address this problem by recovering missing sensor values from the history of states or by adapting learned control policies to handle corrupted sensors through fine-tuning during deployment.\nIn this work, we propose training reinforcement learning (RL) policies that are robust to sensory failures. We use a multimodal encoder designed to account for these failures and a training strategy that randomly drops a subset of sensor modalities, similar to missing observations caused by failed sensors. We conduct evaluations across multiple tasks (bipedal locomotion and robotic manipulation) with varying robot embodiments in both simulation and the real world to demonstrate the effectiveness of our approach. 
Our results show that the proposed method produces robust RL policies that handle failures in both low-dimensional proprioceptive and high-dimensional visual modalities without a significant increase in training time or decrease in sample efficiency, making it a promising solution for learning RL policies robust to sensory failures.", "keywords": "Reinforcement Learning;Robustness;Sensorimotor Learning", "primary_area": "", "supplementary_material": "/attachment/53a37ce3bb7c10f95f5bfe1fe73dd23a4f8c94a4.zip", "author": "Skand Skand;Bikram Pandit;Chanho Kim;Li Fuxin;Stefan Lee", "authorids": "~Skand_Skand1;~Bikram_Pandit1;~Chanho_Kim2;~Li_Fuxin1;~Stefan_Lee1", "gender": "M;M;M;;", "homepage": "https://pvskand.github.io/;https://bikcrum.com/;;;", "dblp": "218/6010;;135/4905.html;;", "google_scholar": ";bikram.pandit;xARSfT4AAAAJ;;", "orcid": ";0009-0007-3601-6118;;;", "linkedin": ";bikcrum/;;;", "or_profile": "~Skand_Skand1;~Bikram_Pandit1;~Chanho_Kim2;~Li_Fuxin1;~Stefan_Lee1", "aff": "Oregon State University;Oregon State University;Oregon State University;;", "aff_domain": "oregonstate.edu;oregonstate.edu;oregonstate.edu;;", "position": "PhD student;MS student;Postdoc;;", "bibtex": "@inproceedings{\nskand2024simple,\ntitle={Simple Masked Training Strategies Yield Control Policies That Are Robust to Sensor Failure},\nauthor={Skand Skand and Bikram Pandit and Chanho Kim and Li Fuxin and Stefan Lee},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AsbyZRdqPv}\n}", "github": "", "project": "", "reviewers": "iBv4;SZc2;X6ey", "site": "https://openreview.net/forum?id=AsbyZRdqPv", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9413122997596092405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "", "aff_unique_url": "https://oregonstate.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "AuJnXGq3AL", "title": "Scaling Cross-Embodied Learning: One Policy for Manipulation, Navigation, Locomotion and Aviation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Modern machine learning systems rely on large datasets to attain broad generalization, and this often poses a challenge in robotic learning, where each robotic platform and task might have only a small dataset. By training a single policy across many different kinds of robots, a robotic learning method can leverage much broader and more diverse datasets, which in turn can lead to better generalization and robustness. However, training a single policy on multi-robot data is challenging because robots can have widely varying sensors, actuators, and control frequencies. We propose CrossFormer, a scalable and flexible transformer-based policy that can consume data from any embodiment. We train CrossFormer on the largest and most diverse dataset to date, 900K trajectories across 20 different robot embodiments. We demonstrate that the same network weights can control vastly different robots, including single and dual arm manipulation systems, wheeled robots, quadcopters, and quadrupeds. 
Unlike prior work, our model does not require manual alignment of the observation or action spaces. Extensive experiments in the real world show that our method matches the performance of specialist policies tailored for each embodiment, while also significantly outperforming the prior state of the art in cross-embodiment learning.", "keywords": "Imitation Learning;Cross-Embodiment", "primary_area": "", "supplementary_material": "/attachment/d5423814975ff6dde6f1d7bb5d0199a460c9812c.zip", "author": "Ria Doshi;Homer Rich Walke;Oier Mees;Sudeep Dasari;Sergey Levine", "authorids": "~Ria_Doshi1;~Homer_Rich_Walke1;~Oier_Mees1;~Sudeep_Dasari2;~Sergey_Levine1", "gender": ";M;M;M;M", "homepage": ";https://homerwalke.com;https://www.oiermees.com/;https://people.eecs.berkeley.edu/~svlevine/;https://sudeepdasari.github.io/", "dblp": ";279/6795;190/8659;80/7594;215/3640", "google_scholar": ";ZWH5jCwAAAAJ;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;8R35rCwAAAAJ;NpOg5soAAAAJ", "orcid": ";;;;", "linkedin": "riadoshi;;oier-mees-a3069488;;", "or_profile": "~Ria_Doshi1;~Homer_Rich_Walke1;~Oier_Mees1;~Sergey_Levine1;~KSudeep_Dasari1", "aff": "University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;Google;Carnegie Mellon University", "aff_domain": "berkeley.edu;berkeley.edu;eecs.berkeley.edu;google.com;cmu.edu", "position": "Undergrad student;PhD student;Postdoc;Research Scientist;PhD student", "bibtex": "@inproceedings{\ndoshi2024scaling,\ntitle={Scaling Cross-Embodied Learning: One Policy for Manipulation, Navigation, Locomotion and Aviation},\nauthor={Ria Doshi and Homer Rich Walke and Oier Mees and Sudeep Dasari and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AuJnXGq3AL}\n}", "github": "https://github.com/rail-berkeley/crossformer", "project": "", "reviewers": "Cs6U;ZGgs;xtGM;MK5T", "site": "https://openreview.net/forum?id=AuJnXGq3AL", "pdf_size": 0, "rating": "3;4;4;4", "confidence": "4;5;4;4", "rating_avg": 3.75, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1288216298584926140&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;2", "aff_unique_norm": "University of California, Berkeley;Google;Carnegie Mellon University", "aff_unique_dep": ";Google;", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com;https://www.cmu.edu", "aff_unique_abbr": "UC Berkeley;Google;CMU", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "AzP6kSEffm", "title": "Dynamics-Guided Diffusion Model for Sensor-less Robot Manipulator Design", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present Dynamics-Guided Diffusion Model (DGDM), a data-driven framework for generating task-specific manipulator designs without task-specific training. Given object shapes and task specifications, DGDM generates sensor-less manipulator designs that can blindly manipulate objects towards desired motions and poses using an open-loop parallel motion. 
This framework 1) flexibly represents manipulation tasks as interaction profiles, 2) represents the design space using a geometric diffusion model, and 3) efficiently searches this design space using the gradients provided by a dynamics network trained without any task information. We evaluate DGDM on various manipulation tasks ranging from shifting/rotating objects to converging objects to a specific pose. Our generated designs outperform optimization-based and unguided diffusion baselines relatively by 31.5\\% and 45.3\\% on average success rate. With the ability to generate a new design within 0.8s, DGDM facilitates rapid design iteration and enhances the adoption of data-driven approaches for robot mechanism design. Qualitative results are best viewed on our project website https://dgdmcorl.github.io.", "keywords": "manipulator design;hardware optimization;diffusion model", "primary_area": "", "supplementary_material": "/attachment/f8581099fbef79f675705005a41de8c3b6c3cb22.zip", "author": "Xiaomeng Xu;Huy Ha;Shuran Song", "authorids": "~Xiaomeng_Xu1;~Huy_Ha1;~Shuran_Song3", "gender": "F;M;F", "homepage": "https://xxm19.github.io/;https://www.cs.columbia.edu/~huy/;https://shurans.github.io/", "dblp": "160/9754;277/9554;", "google_scholar": "af_4iHYAAAAJ;-3-f_8YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Xiaomeng_Xu1;~Huy_Ha1;~Shuran_Song3", "aff": "Stanford University;Columbia University;Stanford University", "aff_domain": "stanford.edu;columbia.edu;stanford.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nxu2024dynamicsguided,\ntitle={Dynamics-Guided Diffusion Model for Sensor-less Robot Manipulator Design},\nauthor={Xiaomeng Xu and Huy Ha and Shuran Song},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=AzP6kSEffm}\n}", "github": "https://github.com/real-stanford/dgdm", "project": "", "reviewers": "NVZK;fWCT;uUTj", "site": "https://openreview.net/forum?id=AzP6kSEffm", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;2", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5203436027402487865&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;Columbia University", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.columbia.edu", "aff_unique_abbr": "Stanford;Columbia", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "B2X57y37kC", "title": "Learning to Look: Seeking Information for Decision Making via Policy Factorization", "track": "main", "status": "Poster", "tldr": "", "abstract": "Many robot manipulation tasks require active or interactive exploration behavior in order to be performed successfully. Such tasks are ubiquitous in embodied domains, where agents must actively search for the information necessary for each stage of a task, e.g., moving the head of the robot to find information relevant to manipulation, or in multi-robot domains, where one scout robot may search for the information that another robot needs to make informed decisions. 
We identify these tasks with a new type of problem, factorized Contextual Markov Decision Processes, and propose DISaM, a dual-policy solution composed of an information-seeking policy that explores the environment to find the relevant contextual information and an information-receiving policy that exploits the context to achieve the manipulation goal. This factorization allows us to train both policies separately, using the information-receiving one to provide reward to train the information-seeking policy. At test time, the dual agent balances exploration and exploitation based on the uncertainty the manipulation policy has on what the next best action is. We demonstrate the capabilities of our dual policy solution in five manipulation tasks that require information-seeking behaviors, both in simulation and in the real-world, where DISaM significantly outperforms existing methods. More information at https://robin-lab.cs.utexas.edu/learning2look/.", "keywords": "Active Vision;Manipulation;Interactive Perception", "primary_area": "", "supplementary_material": "/attachment/dead20101f90259e5e4e601a4793d9943951482b.zip", "author": "Shivin Dass;Jiaheng Hu;Ben Abbatematteo;Peter Stone;Roberto Mart\u00edn-Mart\u00edn", "authorids": "~Shivin_Dass2;~Jiaheng_Hu1;~Ben_Abbatematteo1;~Peter_Stone1;~Roberto_Mart\u00edn-Mart\u00edn1", "gender": "M;M;M;M;M", "homepage": "https://jiahenghu.github.io/;http://www.cs.utexas.edu/~pstone;https://robertomartinmartin.com/;https://babbatem.github.io/;https://shivindass.github.io/", "dblp": ";s/PeterStone;153/7670;265/7692;", "google_scholar": ";qnwjcfAAAAAJ;XOJE8OEAAAAJ;rz3VnGAAAAAJ;3q8ivkoAAAAJ", "orcid": ";0000-0002-6795-420X;0000-0002-9586-2759;;", "linkedin": ";;;;", "or_profile": "~Jiaheng_Hu1;~Peter_Stone1;~Roberto_Mart\u00edn-Mart\u00edn1;~Ben_M_Abbatematteo1;~shivin_dass1", "aff": "University of Texas at Austin;University of Texas, Austin;University of Texas at Austin;Brown University;University of Texas at Austin", "aff_domain": "utexas.edu;utexas.edu;utexas.edu;brown.edu;utexas.edu", "position": "PhD student;Full Professor;Assistant Professor;PhD student;PhD student", "bibtex": "@inproceedings{\ndass2024learning,\ntitle={Learning to Look: Seeking Information for Decision Making via Policy Factorization},\nauthor={Shivin Dass and Jiaheng Hu and Ben Abbatematteo and Peter Stone and Roberto Mart{\\'\\i}n-Mart{\\'\\i}n},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B2X57y37kC}\n}", "github": "", "project": "", "reviewers": "iUtd;ibsv;fHBz", "site": "https://openreview.net/forum?id=B2X57y37kC", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:jfzjqdlkXpcJ:scholar.google.com/&scioq=Learning+to+Look:+Seeking+Information+for+Decision+Making+via+Policy+Factorization&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of Texas at Austin;Brown University", "aff_unique_dep": ";", "aff_unique_url": "https://www.utexas.edu;https://www.brown.edu", "aff_unique_abbr": "UT Austin;Brown", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "B45HRM4Wb4", "title": "ResPilot: Teleoperated Finger Gaiting via Gaussian Process Residual Learning", "track": 
"main", "status": "Poster", "tldr": "", "abstract": "Dexterous robot hand teleoperation allows for long-range transfer of human manipulation expertise, and could simultaneously provide a way for humans to teach these skills to robots. However, current methods struggle to reproduce the functional workspace of the human hand, often limiting them to simple grasping tasks. We present a novel method for finger-gaited manipulation with multi-fingered robot hands. Our method provides the operator enhanced flexibility in making contacts by expanding the reachable workspace of the robot hand through residual Gaussian Process learning. We also assist the operator in maintaining stable contacts with the object by allowing them to constrain fingertips of the hand to move in concert. Extensive quantitative evaluations show that our method significantly increases the reachable workspace of the robot hand and enables the completion of novel dexterous finger gaiting tasks.", "keywords": "Teleoperation;Dexterous Manipulation;Gaussian Process", "primary_area": "", "supplementary_material": "/attachment/885bfaf8308ea7374123678dc940db2b447b8224.zip", "author": "Patrick Naughton;Jinda Cui;Karankumar Patel;Soshi Iba", "authorids": "~Patrick_Naughton1;~Jinda_Cui1;~Karankumar_Patel1;~Soshi_Iba1", "gender": "M;M;;M", "homepage": "https://patricknaughton01.github.io/;https://www.jindacui.com;;", "dblp": "64/5929;;;", "google_scholar": "D6LNDvwAAAAJ;HfmpZZAAAAAJ;y1Rbz1QAAAAJ;", "orcid": ";;;0000-0001-5346-3218", "linkedin": "patrick-naughton-01/;;karan-patel-479960a3/;soshi-iba-7090467", "or_profile": "~Patrick_Naughton1;~Jinda_Cui1;~Karankumar_Patel1;~Soshi_Iba1", "aff": "Honda Research Institution US;Honda Research Institution US;Honda Research Institution US;Honda R&D", "aff_domain": "honda-ri.com;honda-ri.com;honda-ri.com;honda.co.jp", "position": "Intern;Researcher;Researcher;Researcher", "bibtex": "@inproceedings{\nnaughton2024respilot,\ntitle={ResPilot: Teleoperated Finger Gaiting via Gaussian Process Residual Learning},\nauthor={Patrick Naughton and Jinda Cui and Karankumar Patel and Soshi Iba},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B45HRM4Wb4}\n}", "github": "", "project": "", "reviewers": "zYVG;MZs1;5i54", "site": "https://openreview.net/forum?id=B45HRM4Wb4", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;2", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -1.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12953379904871698287&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Honda Research Institute;Honda Research and Development", "aff_unique_dep": "Honda Research Institute;", "aff_unique_url": "https://honda-ri.com;https://www.honda.com/", "aff_unique_abbr": "HRI;Honda R&D", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;Japan" }, { "id": "B7Lf6xEv7l", "title": "DiffusionSeeder: Seeding Motion Optimization with Diffusion for Rapid Motion Planning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Running optimization across many parallel seeds leveraging GPU compute [2] have relaxed the need for a good initialization, but this can fail if the problem is highly non-convex as all seeds could get stuck in local minima. 
One such setting is collision-free motion optimization for robot manipulation, where optimization converges quickly on easy problems but struggles in obstacle-dense environments (e.g., a cluttered cabinet or table). In these situations, graph-based planning algorithms are called to obtain seeds, resulting in significant slowdowns. We propose DiffusionSeeder, a diffusion-based approach that generates trajectories to seed motion optimization for rapid robot motion planning. DiffusionSeeder takes the initial depth image observation of the scene and generates high-quality, multi-modal trajectories that are then fine-tuned with a few iterations of motion optimization. We integrated DiffusionSeeder with cuRobo, a GPU-accelerated motion optimization method, to generate the seed trajectories, which results in a 12x speed up on average, and a 36x speed up for more complicated problems, while achieving a 10% higher success rate in partially observed simulation environments. Our results demonstrate the effectiveness of using diverse solutions from the learned diffusion model. Physical experiments on a Franka robot demonstrate the sim2real transfer of DiffusionSeeder to the real robot, with an average success rate of 86% and a planning time of 26ms, improving on cuRobo with a 51% higher success rate and a 2.5x speed up. The code and the model weights will be available after publication.", "keywords": "Robot Motion Planning;Diffusion Model", "primary_area": "", "supplementary_material": "/attachment/cd3175849fc5d944ea34849aa92d156f795e8cdd.zip", "author": "Huang Huang;Balakumar Sundaralingam;Arsalan Mousavian;Adithyavairavan Murali;Ken Goldberg;Dieter Fox", "authorids": "~Huang_Huang1;~Balakumar_Sundaralingam1;~Arsalan_Mousavian1;~Adithyavairavan_Murali2;~Ken_Goldberg1;~Dieter_Fox1", "gender": ";M;M;M;M;M", "homepage": "https://sites.google.com/site/huanghuang9729/home;https://balakumar-s.github.io/;https://cs.gmu.edu/~amousavi/;http://adithyamurali.com;http://goldberg.berkeley.edu/;https://homes.cs.washington.edu/~fox/", "dblp": ";;164/8572;;g/KennethYGoldberg;f/DieterFox", "google_scholar": ";https://scholar.google.com/citations?hl=en;fcA9m88AAAAJ;Tjj8TZAAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ;DqXsbPAAAAAJ", "orcid": ";;;;0000-0001-6747-9499;", "linkedin": ";;;adithyamurali;goldbergken/;", "or_profile": "~Huang_Huang1;~Balakumar_Sundaralingam1;~Arsalan_Mousavian1;~Adithyavairavan_Murali2;~Ken_Goldberg1;~Dieter_Fox1", "aff": "University of California, Berkeley;NVIDIA;NVIDIA;;University of California, Berkeley;Department of Computer Science", "aff_domain": "berkeley.edu;nvidia.com;nvidia.com;;berkeley.edu;cs.washington.edu", "position": "PhD student;Research Scientist;Research Scientist;;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024diffusionseeder,\ntitle={DiffusionSeeder: Seeding Motion Optimization with Diffusion for Rapid Motion Planning},\nauthor={Huang Huang and Balakumar Sundaralingam and Arsalan Mousavian and Adithyavairavan Murali and Ken Goldberg and Dieter Fox},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=B7Lf6xEv7l}\n}", "github": "", "project": "", "reviewers": "oM7J;QoEm;kGxy", "site": "https://openreview.net/forum?id=B7Lf6xEv7l", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=18390465273052660424&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;0;2", "aff_unique_norm": "University of California, Berkeley;NVIDIA;Unknown Institution", "aff_unique_dep": ";NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.nvidia.com;", "aff_unique_abbr": "UC Berkeley;NVIDIA;", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "BmvUg1FIWC", "title": "Neural Inverse Source Problem", "track": "main", "status": "Poster", "tldr": "", "abstract": "Reconstructing unknown external source functions is an important perception capability for a large range of robotics domains including manipulation, aerial, and underwater robotics. In this work, we propose a Physics-Informed Neural Network (PINN) based approach for solving the inverse source problems in robotics, jointly identifying unknown source functions and the complete state of a system given partial and noisy observations. Our approach demonstrates several advantages over prior works (Finite Element Methods (FEM) and data-driven approaches): it offers flexibility in integrating diverse constraints and boundary conditions; eliminates the need for complex discretizations (e.g., meshing); easily accommodates gradients from real measurements; and does not limit performance based on the diversity and quality of training data. We validate our method across three simulation and real-world scenarios involving up to 4th order partial differential equations (PDEs), constraints such as Signorini and Dirichlet, and various regression losses including Chamfer distance and L2 norm.", "keywords": "Inverse source problem;Physics informed neural network", "primary_area": "", "supplementary_material": "", "author": "Youngsun Wi;Jayjun Lee;Miquel Oller;Nima Fazeli", "authorids": "~Youngsun_Wi1;jayjun@umich.edu;~Miquel_Oller1;~Nima_Fazeli1", "gender": ";;;", "homepage": "https://www.mmintlab.com/;;;https://www.mmintlab.com", "dblp": ";;;", "google_scholar": ";;N8LKz0kAAAAJ;", "orcid": ";;;", "linkedin": "youngsun-wi-1332761a0/;;;", "or_profile": "~Youngsun_Wi1;jayjun@umich.edu;~Miquel_Oller1;~Nima_Fazeli1", "aff": "University of Michigan;;University of Michigan - Ann Arbor;University of Michigan", "aff_domain": "umich.edu;;umich.edu;umich.edu", "position": "PhD student;;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwi2024neural,\ntitle={Neural Inverse Source Problem},\nauthor={Youngsun Wi and Jayjun Lee and Miquel Oller and Nima Fazeli},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=BmvUg1FIWC}\n}", "github": "", "project": "", "reviewers": "FvMb;dC7P;D4sA", "site": "https://openreview.net/forum?id=BmvUg1FIWC", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1552664704132361321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "1", "aff_campus_unique": ";Ann Arbor", "aff_country_unique_index": "0;0;0", "aff_country_unique": 
"United States" }, { "id": "Bq4XOaU4sV", "title": "Bridging the Sim-to-Real Gap from the Information Bottleneck Perspective", "track": "main", "status": "Poster", "tldr": "", "abstract": "Reinforcement Learning (RL) has recently achieved remarkable success in robotic control. However, most works in RL operate in simulated environments where privileged knowledge (e.g., dynamics, surroundings, terrains) is readily available. Conversely, in real-world scenarios, robot agents usually rely solely on local states (e.g., proprioceptive feedback of robot joints) to select actions, leading to a significant sim-to-real gap. Existing methods address this gap by either gradually reducing the reliance on privileged knowledge or performing a two-stage policy imitation. However, we argue that these methods are limited in their ability to fully leverage the available privileged knowledge, resulting in suboptimal performance. In this paper, we formulate the sim-to-real gap as an information bottleneck problem and therefore propose a novel privileged knowledge distillation method called the Historical Information Bottleneck (HIB). In particular, HIB learns a privileged knowledge representation from historical trajectories by capturing the underlying changeable dynamic information. Theoretical analysis shows that the learned privileged knowledge representation helps reduce the value discrepancy between the oracle and learned policies. Empirical experiments on both simulated and real-world tasks demonstrate that HIB yields improved generalizability compared to previous methods.", "keywords": "Sim-to-Real;Information Bottleneck;Reinforcement Learning;Locomotion", "primary_area": "", "supplementary_material": "/attachment/e6700ea4ef905dbb0720aed959936a540ec7ef4b.zip", "author": "Haoran He;Peilin Wu;Chenjia Bai;Hang Lai;Lingxiao Wang;Ling Pan;Xiaolin Hu;Weinan Zhang", "authorids": "~Haoran_He1;~Peilin_Wu3;~Chenjia_Bai2;~Hang_Lai1;~Lingxiao_Wang6;~Ling_Pan1;~Xiaolin_Hu1;~Weinan_Zhang1", "gender": "M;;M;M;M;F;M;M", "homepage": "https://tinnerhrhe.github.io/;https://peilinwu.site/;https://baichenjia.github.io/;http://www.apexlab.org/members/laihang@apexlab.org;;https://ling-pan.github.io/;http://www.xlhu.cn/;http://wnzhang.net", "dblp": "299/7312;;247/1943;;140/1229;199/9303/;60/6028-1;28/10261-1", "google_scholar": "Z33PHQ0AAAAJ;https://scholar.google.com/citations?hl=zh-CN;Rm_1y2kAAAAJ;;;qZ_zlacAAAAJ;PksdgoUAAAAJ;Qzss0GEAAAAJ", "orcid": "0000-0002-7340-8643;;;;;;0000-0002-4907-7354;0000-0002-0127-2425", "linkedin": ";;;;;;;", "or_profile": "~Haoran_He1;~Peilin_Wu3;~Chenjia_Bai2;~Hang_Lai1;~Lingxiao_Wang6;~Ling_Pan1;~Xiaolin_Hu1;~Weinan_Zhang1", "aff": "Hong Kong University of Science and Technology;Shanghai Jiaotong University;Shanghai AI Laboratory;Shanghai Jiaotong University;Northwestern University;Montreal Institute for Learning Algorithms (MILA);Tsinghua University;Shanghai Jiaotong University", "aff_domain": "connect.ust.hk;sjtu.edu.cn;pjlab.org.cn;sjtu.edu.cn;northwestern.edu;mila.umontreal.ca;tsinghua.edu.cn;sjtu.edu.cn", "position": "PhD student;Undergrad student;Researcher;MS student;PhD student;Postdoc;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nhe2024bridging,\ntitle={Bridging the Sim-to-Real Gap from the Information Bottleneck Perspective},\nauthor={Haoran He and Peilin Wu and Chenjia Bai and Hang Lai and Lingxiao Wang and Ling Pan and Xiaolin Hu and Weinan Zhang},\nbooktitle={8th Annual Conference on Robot 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Bq4XOaU4sV}\n}", "github": "https://github.com/tinnerhrhe/HIB_Policy", "project": "", "reviewers": "jvAp;HU57;jExp;iHnC", "site": "https://openreview.net/forum?id=Bq4XOaU4sV", "pdf_size": 0, "rating": "3;3;3;4", "confidence": "4;3;3;3", "rating_avg": 3.25, "confidence_avg": 3.25, "replies_avg": 6, "authors#_avg": 8, "corr_rating_confidence": -0.3333333333333333, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18287284482282754574&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;1;3;4;5;1", "aff_unique_norm": "Hong Kong University of Science and Technology;Shanghai Jiao Tong University;Shanghai AI Laboratory;Northwestern University;Montreal Institute for Learning Algorithms;Tsinghua University", "aff_unique_dep": ";;;;Artificial Intelligence;", "aff_unique_url": "https://www.ust.hk;https://www.sjtu.edu.cn;https://www.shanghai-ai-lab.com;https://www.northwestern.edu;https://mila.quebec;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HKUST;SJTU;SAIL;NU;MILA;THU", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Hong Kong SAR;;Montreal", "aff_country_unique_index": "0;0;0;0;1;2;0;0", "aff_country_unique": "China;United States;Canada" }, { "id": "CPQW5kc0pe", "title": "VoxAct-B: Voxel-Based Acting and Stabilizing Policy for Bimanual Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Bimanual manipulation is critical to many robotics applications. In contrast to single-arm manipulation, bimanual manipulation tasks are challenging due to higher-dimensional action spaces. Prior works leverage large amounts of data and primitive actions to address this problem, but may suffer from sample inefficiency and limited generalization across various tasks. To this end, we propose VoxAct-B, a language-conditioned, voxel-based method that leverages Vision Language Models (VLMs) to prioritize key regions within the scene and reconstruct a voxel grid. We provide this voxel grid to our bimanual manipulation policy to learn acting and stabilizing actions. This approach enables more efficient policy learning from voxels and is generalizable to different tasks. In simulation, we show that VoxAct-B outperforms strong baselines on fine-grained bimanual manipulation tasks. Furthermore, we demonstrate VoxAct-B on real-world $\\texttt{Open Drawer}$ and $\\texttt{Open Jar}$ tasks using two UR5s. Code, data, and videos are available at https://voxact-b.github.io.", "keywords": "bimanual manipulation;voxel representation;vision language models", "primary_area": "", "supplementary_material": "/attachment/427d8fa501769ad4bc853f23e5b35d7c08446f54.zip", "author": "I-Chun Arthur Liu;Sicheng He;Daniel Seita;Gaurav S. 
Sukhatme", "authorids": "~I-Chun_Arthur_Liu1;sichengh@usc.edu;~Daniel_Seita1;~Gaurav_S._Sukhatme1", "gender": "M;;;M", "homepage": "http://arthurliu.com/;;;http://www-robotics.usc.edu/~gaurav/", "dblp": ";;;s/GauravSSukhatme", "google_scholar": "ToWC_fgAAAAJ;;;https://scholar.google.com.tw/citations?user=lRUi-A8AAAAJ", "orcid": "0000-0001-7144-634X;;;0000-0003-2408-474X", "linkedin": "i-chun-arthur-liu/;;;gaurav-sukhatme-9b6420b/", "or_profile": "~I-Chun_Arthur_Liu1;sichengh@usc.edu;~Daniel_Seita1;~Gaurav_S._Sukhatme1", "aff": "University of Southern California;;;University of Southern California", "aff_domain": "usc.edu;;;usc.edu", "position": "PhD student;;;Full Professor", "bibtex": "@inproceedings{\nliu2024voxactb,\ntitle={VoxAct-B: Voxel-Based Acting and Stabilizing Policy for Bimanual Manipulation},\nauthor={I-Chun Arthur Liu and Sicheng He and Daniel Seita and Gaurav S. Sukhatme},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CPQW5kc0pe}\n}", "github": "https://github.com/VoxAct-B/voxactb", "project": "", "reviewers": "6SQW;nMKS;7y5C", "site": "https://openreview.net/forum?id=CPQW5kc0pe", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;4", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=225131447719718662&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "CpXiqz6qf4", "title": "SonicSense: Object Perception from In-Hand Acoustic Vibration", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce SonicSense, a holistic design of hardware and software to enable rich robot object perception through in-hand acoustic vibration sensing. While previous studies have shown promising results with acoustic sensing for object perception, current solutions are constrained to a handful of objects with simple geometries and homogeneous materials, single-finger sensing, and mixing training and testing on the same objects. SonicSense enables container inventory status differentiation, heterogeneous material prediction, 3D shape reconstruction, and object re-identification from a diverse set of 83 real-world objects. Our system employs a simple but effective heuristic exploration policy to interact with the objects as well as end-to-end learning-based algorithms to fuse vibration signals to infer object properties. 
Our framework underscores the significance of in-hand acoustic vibration sensing in advancing robot tactile perception.", "keywords": "Tactile Perception;Object State Estimation;Audio;Acoustic Vibration Sensing", "primary_area": "", "supplementary_material": "/attachment/6eeb9cdd25bc9b946697bdc758af1f2a33684a3b.zip", "author": "Jiaxun Liu;Boyuan Chen", "authorids": "~Jiaxun_Liu1;~Boyuan_Chen1", "gender": ";Not Specified", "homepage": "https://www.jiaxunliu.com/;http://boyuanchen.com/", "dblp": ";193/7174-1", "google_scholar": ";5DBpY6EAAAAJ", "orcid": ";", "linkedin": "jiaxun-liu-5b26b6242?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;boyuan-chen-b30854a0/", "or_profile": "~Jiaxun_Liu1;~Boyuan_Chen1", "aff": "Duke University;Duke University", "aff_domain": "duke.edu;duke.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024sonicsense,\ntitle={SonicSense: Object Perception from In-Hand Acoustic Vibration},\nauthor={Jiaxun Liu and Boyuan Chen},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CpXiqz6qf4}\n}", "github": "https://github.com/generalroboticslab/SonicSense?tab=readme-ov-file", "project": "", "reviewers": "UVNB;M19m;bAXR", "site": "https://openreview.net/forum?id=CpXiqz6qf4", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 2, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14811108450190524227&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "CskuWHDBAr", "title": "Enhancing Visual Domain Robustness in Behaviour Cloning via Saliency-Guided Augmentation", "track": "main", "status": "Poster", "tldr": "", "abstract": "In vision-based behaviour cloning (BC), traditional image-level augmentation methods such as pixel shifting enhance in-domain performance but often struggle with visual domain shifts, including distractors, occlusion, and changes in lighting and backgrounds. Conversely, superimposition-based augmentation, proven effective in computer vision, improves model generalisability by blending training images and out-of-domain images. Despite its potential, the applicability of these methods to vision-based BC remains unclear due to the unique challenges posed by BC demonstrations; specifically, preserving task-critical scene semantics, spatial-temporal relationships, and agent-target interactions is crucial. To address this, we introduce RoboSaGA, a context-aware approach that dynamically adjusts augmentation intensity per pixel based on input saliency derived from the policy. This method ensures aggressive augmentation within task-trivial areas without compromising task-critical information. Furthermore, RoboSaGA seamlessly integrates into existing network architectures without the need for structural changes or additional learning objectives. 
Our empirical evaluations across both simulated and real-world settings demonstrate that RoboSaGA not only maintains in-domain performance but significantly improves resilience to distractors and background variations.", "keywords": "Behaviour Cloning;Visuomotor Policy;Data Augmentation", "primary_area": "", "supplementary_material": "/attachment/56c99d32165c314ff713bbbdb143d6f305a78a7a.zip", "author": "Zheyu Zhuang;RUIYU WANG;Nils Ingelhag;Ville Kyrki;Danica Kragic", "authorids": "~Zheyu_Zhuang1;~RUIYU_WANG1;ingelhag@kth.se;~Ville_Kyrki1;~Danica_Kragic1", "gender": "M;F;;;F", "homepage": ";https://ruiyuwang.github.io;;https://irobotics.aalto.fi;http://www.csc.kth.se/~danik", "dblp": ";;;07/2806;82/1211", "google_scholar": "https://scholar.google.se/citations?user=zHFrndgAAAAJ;YoTCQe4AAAAJ;;8OBnyXQAAAAJ;", "orcid": ";0009-0008-7672-970X;;;", "linkedin": ";ruiyu-wang-8ba225208;;;", "or_profile": "~Zheyu_Zhuang1;~RUIYU_WANG1;ingelhag@kth.se;~Ville_Kyrki1;~Danica_Kragic1", "aff": "KTH Royal Institute of Technology;KTH Royal Institute of Technology;;Aalto University;KTH", "aff_domain": "kth.se;kth.se;;aalto.fi;kth.se", "position": "Postdoc;PhD student;;Full Professor;Professor", "bibtex": "@inproceedings{\nzhuang2024enhancing,\ntitle={Enhancing Visual Domain Robustness in Behaviour Cloning via Saliency-Guided Augmentation},\nauthor={Zheyu Zhuang and RUIYU WANG and Nils Ingelhag and Ville Kyrki and Danica Kragic},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=CskuWHDBAr}\n}", "github": "https://github.com/Zheyu-Zhuang/RoboSaGA", "project": "", "reviewers": "wTGw;Nf11;Kp3m", "site": "https://openreview.net/forum?id=CskuWHDBAr", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;3", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=585691244095487426&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "KTH Royal Institute of Technology;Aalto University", "aff_unique_dep": ";", "aff_unique_url": "https://www.kth.se;https://www.aalto.fi", "aff_unique_abbr": "KTH;Aalto", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0", "aff_country_unique": "Sweden;Finland" }, { "id": "Czs2xH9114", "title": "WoCoCo: Learning Whole-Body Humanoid Control with Sequential Contacts", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humanoid activities involving sequential contacts are crucial for complex robotic interactions and operations in the real world and are traditionally solved by model-based motion planning, which is time-consuming and often relies on simplified dynamics models. \nAlthough model-free reinforcement learning (RL) has become a powerful tool for versatile and robust whole-body humanoid control, \nit still requires tedious task-specific tuning and state machine design and suffers from long-horizon exploration issues in tasks involving contact sequences. In this work, we propose WoCoCo (Whole-Body Control with Sequential Contacts), a unified framework to learn whole-body humanoid control with sequential contacts by naturally decomposing the tasks into separate contact stages. Such decomposition facilitates simple and general policy learning pipelines through task-agnostic reward and sim-to-real designs, requiring only one or two task-related terms to be specified for each task. 
We demonstrated that end-to-end RL-based controllers trained with WoCoCo enable four challenging whole-body humanoid tasks involving diverse contact sequences in the real world without any motion priors: 1) versatile parkour jumping, 2) box loco-manipulation, 3) dynamic clap-and-tap dancing, and 4) cliffside climbing. We further show that WoCoCo is a general framework beyond humanoid by applying it in 22-DoF dinosaur robot loco-manipulation tasks. Website: lecar-lab.github.io/wococo/.", "keywords": "Whole-Body Humanoid Control;Multi-Contact Control;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/26af738e231863416a08745358a94cd70f2f2688.zip", "author": "Chong Zhang;Wenli Xiao;Tairan He;Guanya Shi", "authorids": "~Chong_Zhang6;~Wenli_Xiao1;~Tairan_He1;~Guanya_Shi1", "gender": "Not Specified;M;M;M", "homepage": "https://zita-ch.github.io/;https://wenlixiao-cs.github.io/;https://tairanhe.com;http://guanyashi.github.io", "dblp": ";;263/2891.html;230/4386", "google_scholar": ";https://scholar.google.com/citations?hl=en;TVWH2U8AAAAJ;joR1Z4UAAAAJ", "orcid": ";;;0000-0002-9075-3705", "linkedin": ";wenli-xiao/;tairan-he-41a904294/;guanya-shi-b07b43126/", "or_profile": "~Chong_Zhang6;~Wenli_Xiao1;~Tairan_He1;~Guanya_Shi1", "aff": "ETHZ - ETH Zurich;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "ethz.ch;cmu.edu;andrew.cmu.edu;andrew.cmu.edu", "position": "MS student;MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024wococo,\ntitle={WoCoCo: Learning Whole-Body Humanoid Control with Sequential Contacts},\nauthor={Chong Zhang and Wenli Xiao and Tairan He and Guanya Shi},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Czs2xH9114}\n}", "github": "", "project": "", "reviewers": "MQ1w;HtJA;PP8V", "site": "https://openreview.net/forum?id=Czs2xH9114", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;5;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 45, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9293766414578731737&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "ETH Zurich;Carnegie Mellon University", "aff_unique_dep": ";", "aff_unique_url": "https://www.ethz.ch;https://www.cmu.edu", "aff_unique_abbr": "ETHZ;CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Switzerland;United States" }, { "id": "DDIoRSh8ID", "title": "Multi-Task Interactive Robot Fleet Learning with Visual World Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent advancements in large-scale multi-task robot learning offer the potential for deploying robot fleets in household and industrial settings, enabling them to perform diverse tasks across various environments. However, AI-enabled robots often face challenges with generalization and robustness when exposed to real-world variability and uncertainty. We introduce Sirius-Fleet, a multi-task interactive robot fleet learning framework to address these challenges. Sirius-Fleet monitors robot performance during deployment and involves humans to correct the robot's actions when necessary. 
We employ a visual world model to predict the outcomes of future actions and build anomaly predictors to predict whether they will likely result in anomalies. As the robot autonomy improves, the anomaly predictors automatically adapt their prediction criteria, leading to fewer requests for human intervention and gradually reducing human workload over time. Evaluations on large-scale benchmarks demonstrate Sirius-Fleet's effectiveness in improving multi-task policy performance and monitoring accuracy. We demonstrate Sirius-Fleet's performance in both RoboCasa in simulation and Mutex in the real world, two diverse, large-scale multi-task benchmarks. More information is available on the project website: https://ut-austin-rpl.github.io/sirius-fleet", "keywords": "Robot Manipulation;Interactive Imitation Learning;Fleet Learning", "primary_area": "", "supplementary_material": "/attachment/cb816467b676b7b6f52ffe44ab9cc088670ce3c8.zip", "author": "Huihan Liu;Yu Zhang;Vaarij Betala;Evan Zhang;James Liu;Crystal Ding;Yuke Zhu", "authorids": "~Huihan_Liu1;~Yu_Zhang77;vaarijbetala@gmail.com;evanczhang@utexas.edu;liujames2003@gmail.com;crystald@utexas.edu;~Yuke_Zhu1", "gender": ";M;;;;;M", "homepage": ";https://franklin-zhang0.github.io/;;;;;https://cs.utexas.edu/~yukez/", "dblp": ";;;;;;133/1772", "google_scholar": ";GZeoy0oAAAAJ;;;;;mWGyYMsAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Huihan_Liu1;~Yu_Zhang77;vaarijbetala@gmail.com;evanczhang@utexas.edu;liujames2003@gmail.com;crystald@utexas.edu;~Yuke_Zhu1", "aff": ";ShanghaiTech University;;;;;Computer Science Department, University of Texas, Austin", "aff_domain": ";shanghaitech.edu.cn;;;;;cs.utexas.edu", "position": ";Undergrad student;;;;;Assistant Professor", "bibtex": "@inproceedings{\nliu2024multitask,\ntitle={Multi-Task Interactive Robot Fleet Learning with Visual World Models},\nauthor={Huihan Liu and Yu Zhang and Vaarij Betala and Evan Zhang and James Liu and Crystal Ding and Yuke Zhu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DDIoRSh8ID}\n}", "github": "", "project": "", "reviewers": "eiin;9gCY;sGwF", "site": "https://openreview.net/forum?id=DDIoRSh8ID", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1297220880250611910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "ShanghaiTech University;University of Texas at Austin", "aff_unique_dep": ";Computer Science Department", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "ShanghaiTech;UT Austin", "aff_campus_unique_index": "1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1", "aff_country_unique": "China;United States" }, { "id": "DSdAEsEGhE", "title": "SoloParkour: Constrained Reinforcement Learning for Visual Locomotion from Privileged Experience", "track": "main", "status": "Poster", "tldr": "", "abstract": "Parkour poses a significant challenge for legged robots, requiring navigation through complex environments with agility and precision based on limited sensory inputs.\nIn this work, we introduce a novel method for training end-to-end visual policies, from depth pixels to robot control commands, to achieve agile and safe quadruped locomotion.\nWe formulate robot parkour as a constrained 
reinforcement learning (RL) problem designed to maximize the emergence of agile skills within the robot's physical limits while ensuring safety. \nWe first train a policy without vision using privileged information about the robot's surroundings. \nWe then generate experience from this privileged policy to warm-start a sample efficient off-policy RL algorithm from depth images.\nThis allows the robot to adapt behaviors from this privileged experience to visual locomotion while circumventing the high computational costs of RL directly from pixels.\nWe demonstrate the effectiveness of our method on a real Solo-12 robot, showcasing its capability to perform a variety of parkour skills such as walking, climbing, leaping, and crawling.", "keywords": "Reinforcement Learning;Agile Locomotion;Visuomotor Control", "primary_area": "", "supplementary_material": "/attachment/0269c46878e3b60f16499f0eb6593587c8314cbe.zip", "author": "Elliot Chane-Sane;Joseph Amigo;Thomas Flayols;Ludovic Righetti;Nicolas Mansard", "authorids": "~Elliot_Chane-Sane1;~Joseph_Amigo1;thomas.flayols@laas.fr;~Ludovic_Righetti1;~Nicolas_Mansard1", "gender": "M;M;;M;Unspecified", "homepage": ";;;https://engineering.nyu.edu/faculty/ludovic-righetti;https://gepettoweb.laas.fr/index.php/Members/NicolasMansard", "dblp": ";;;;90/5900", "google_scholar": "ejHZv20AAAAJ;;;LuA1j4oAAAAJ;rq-9xAkAAAAJ", "orcid": ";;;0000-0002-6458-9112;", "linkedin": "https://fr.linkedin.com/in/elliot-chane-sane;joseph-amigo-bb876a174;;;", "or_profile": "~Elliot_Chane-Sane1;~Joseph_Amigo1;thomas.flayols@laas.fr;~Ludovic_Righetti1;~Nicolas_Mansard1", "aff": "LAAS / CNRS;New York University;;Max-Planck Institute;LAAS / CNRS", "aff_domain": "laas.fr;nyu.edu;;mpg.de;laas.fr", "position": "Postdoc;PhD student;;Research Group Leader;Researcher", "bibtex": "@inproceedings{\nchane-sane2024soloparkour,\ntitle={SoloParkour: Constrained Reinforcement Learning for Visual Locomotion from Privileged Experience},\nauthor={Elliot Chane-Sane and Joseph Amigo and Thomas Flayols and Ludovic Righetti and Nicolas Mansard},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DSdAEsEGhE}\n}", "github": "https://github.com/Gepetto/SoloParkour", "project": "", "reviewers": "1Wye;vns5;uDaM", "site": "https://openreview.net/forum?id=DSdAEsEGhE", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;2;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9007076211682506738&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 12, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "LAAS;New York University;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.", "aff_unique_dep": ";;", "aff_unique_url": "https://www.laas.fr/;https://www.nyu.edu;https://www.mpg.de", "aff_unique_abbr": "LAAS;NYU;MPG", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;2;0", "aff_country_unique": "France;United States;Germany" }, { "id": "Dftu4r5jHe", "title": "Context-Aware Replanning with Pre-Explored Semantic Map for Object Navigation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Pre-explored Semantic Map, constructed through prior exploration using visual language models (VLMs), has proven effective as a foundational element for training-free robotic applications. 
However, existing approaches assume the map's accuracy and do not provide effective mechanisms for revising decisions based on incorrect maps. This work introduces Context-Aware Replanning (CARe), which estimates map uncertainty through confidence scores and multi-view consistency, enabling the agent to revise erroneous decisions stemming from inaccurate maps without additional labels. We demonstrate the effectiveness of our proposed method using two modern map backbones, VLMaps and OpenMask3D, and show significant improvements in performance on object navigation tasks.", "keywords": "VLMs;map;navigation;uncertainty;multi-view consistency;robotics", "primary_area": "", "supplementary_material": "/attachment/86229662b408ce96326c0d19f4087568b7242b0a.zip", "author": "Po-Chen Ko;Hung-Ting Su;CY Chen;Jia-Fong Yeh;Min Sun;Winston H. Hsu", "authorids": "~Po-Chen_Ko1;~Hung-Ting_Su1;~CY_Chen1;~Jia-Fong_Yeh1;~Min_Sun1;~Winston_H._Hsu2", "gender": "M;M;;M;M;M", "homepage": ";;https://www.cmlab.csie.ntu.edu.tw/~jiafongyeh/;http://aliensunmin.github.io;;https://winstonhsu.info/", "dblp": "230/2143;;198/7831;62/2750-1;359/0626;16/5668.html", "google_scholar": "5oNVau8AAAAJ;;kS-oZ20AAAAJ;1Rf6sGcAAAAJ;https://scholar.google.com.tw/citations?user=LihTgFUAAAAJ;https://scholar.google.com.tw/citations?user=NOvDH3QAAAAJ", "orcid": ";;;;;0000-0002-3330-0638", "linkedin": ";jeffreychen-tw;;;%E6%9F%8F%E8%BE%B0-%E6%9F%AF-1405b427a/edit/forms/contact-info/new/#:~:text=https%3A//www.linkedin.com/in/%25E6%259F%258F%25E8%25BE%25B0%2D%25E6%259F%25AF%2D1405b427a;", "or_profile": "~Hung-Ting_Su1;~CY_Chen1;~Jia-Fong_Yeh1;~Min_Sun1;~Pochen_Ko1;~Winston_Hsu1", "aff": "National Taiwan University;National Taiwan University;Sony Group Corporation;National Tsing Hua University;National Taiwan University;National Taiwan University", "aff_domain": "ntu.edu.tw;ntu.edu.tw;sony.com;nthu.edu.tw;ntu.edu.tw;ntu.edu.tw", "position": "Postdoc;MS student;Intern;Assistant Professor;Undergrad student;Professor", "bibtex": "@inproceedings{\nko2024contextaware,\ntitle={Context-Aware Replanning with Pre-Explored Semantic Map for Object Navigation},\nauthor={Po-Chen Ko and Hung-Ting Su and CY Chen and Jia-Fong Yeh and Min Sun and Winston H. 
Hsu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Dftu4r5jHe}\n}", "github": "https://github.com/CARe-maps/CARe_experiments", "project": "", "reviewers": "2uSj;f959;ojnQ", "site": "https://openreview.net/forum?id=Dftu4r5jHe", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:f8ziYE-b3F8J:scholar.google.com/&scioq=Context-Aware+Replanning+with+Pre-Explored+Semantic+Map+for+Object+Navigation&hl=en&as_sdt=0,5", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "National Taiwan University;Sony Group Corporation;National Tsing Hua University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ntu.edu.tw;https://www.sony.com;https://www.nthu.edu.tw", "aff_unique_abbr": "NTU;Sony;NTHU", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Taiwan;", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;Japan" }, { "id": "DsFQg0G4Xu", "title": "Learning Long-Horizon Action Dependencies in Sampling-Based Bilevel Planning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Autonomous robots will need the ability to make task and motion plans that involve long sequences of actions, e.g. to prepare a meal. One challenge is that the feasibility of actions late in the plan may depend on much earlier actions. This issue is exacerbated if these dependencies exist at a purely geometric level, making them difficult to express for a task planner. Backtracking is a common technique to resolve such geometric dependencies, but its time complexity limits its applicability to short-horizon dependencies. We propose an approach to account for these dependencies by learning a search heuristic for task and motion planning. 
We evaluate our approach on five quasi-static simulated domains and show a substantial improvement in success rate over the baselines.", "keywords": "task and motion planning;long-horizon;learning for planning", "primary_area": "", "supplementary_material": "/attachment/81268d0dd8623d5564743231ce7299759ae3f6fc.zip", "author": "Bart\u0142omiej Cie\u015blar;Leslie Pack Kaelbling;Tom\u00e1s Lozano-P\u00e9rez;Jorge Mendez-Mendez", "authorids": "~Bart\u0142omiej_Cie\u015blar1;~Leslie_Pack_Kaelbling1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Jorge_Mendez-Mendez1", "gender": ";F;M;M", "homepage": "https://bcieslar.pl/;http://people.csail.mit.edu/lpk/;http://people.csail.mit.edu/tlp/;https://www.seas.upenn.edu/~mendezme/", "dblp": ";k/LesliePackKaelbling;90/752;255/6609", "google_scholar": ";IcasIiwAAAAJ;gQOKAggAAAAJ;87sQtnsAAAAJ", "orcid": ";0000-0001-6054-7145;;0000-0002-2537-598X", "linkedin": "bcieslar;;;", "or_profile": "~Bart\u0142omiej_Cie\u015blar1;~Leslie_Pack_Kaelbling1;~Tom\u00e1s_Lozano-P\u00e9rez1;~Jorge_Armando_Mendez_Mendez1", "aff": "Imperial College London;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "imperial.ac.uk;mit.edu;mit.edu;mit.edu", "position": "MS student;Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\ncie{\\'s}lar2024learning,\ntitle={Learning Long-Horizon Action Dependencies in Sampling-Based Bilevel Planning},\nauthor={Bart{\\l}omiej Cie{\\'s}lar and Leslie Pack Kaelbling and Tom{\\'a}s Lozano-P{\\'e}rez and Jorge Mendez-Mendez},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=DsFQg0G4Xu}\n}", "github": "", "project": "", "reviewers": "5rxt;dUuw;KAhV", "site": "https://openreview.net/forum?id=DsFQg0G4Xu", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kuDO36crm3UJ:scholar.google.com/&scioq=Learning+Long-Horizon+Action+Dependencies+in+Sampling-Based+Bilevel+Planning&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Imperial College London;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.imperial.ac.uk;https://web.mit.edu", "aff_unique_abbr": "ICL;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "E4K3yLQQ7s", "title": "Visual Manipulation with Legs", "track": "main", "status": "Poster", "tldr": "", "abstract": "Animals have the ability to use their arms and legs for both locomotion and manipulation. We envision quadruped robots to have the same versatility. This work presents a system that empowers a quadruped robot to perform object interactions with its legs, drawing inspiration from non-prehensile manipulation techniques. The proposed system has two main components: a visual manipulation policy module and a loco-manipulator module. The visual manipulation policy module decides how the leg should interact with the object, trained with reinforcement learning (RL) with point cloud observations and object-centric actions. The loco-manipulator controller controls the leg movements and body pose adjustments, implemented based on impedance control and Model Predictive Control (MPC). 
Besides manipulating objects with a single leg, the proposed system can also select from left or right legs based on the critic maps and move the object to distant goals through robot base adjustment. In the experiments, we evaluate the proposed system with the object pose alignment tasks both in simulation and in the real world, demonstrating object manipulation skills with legs more versatile than previous work.", "keywords": "Legged robots;Non-prehensile manipulation;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/cfa0a8db38bc2292c06f82fbee78773b63e6af15.zip", "author": "Xialin He;Chengjing Yuan;Wenxuan Zhou;Ruihan Yang;David Held;Xiaolong Wang", "authorids": "~Xialin_He1;~Chengjing_Yuan1;~Wenxuan_Zhou1;~Ruihan_Yang2;~David_Held1;~Xiaolong_Wang3", "gender": "M;;F;M;M;M", "homepage": "https://xialin-he.github.io/;https://www.yuanchengjing.com;https://wenxuan-zhou.github.io/;http://rchalyang.github.io/;http://davheld.github.io/;https://xiaolonw.github.io/", "dblp": ";;;;22/11147;91/952-4", "google_scholar": "-oy5DaIAAAAJ;;picvdvEAAAAJ;b-o1o7cAAAAJ;0QtU-NsAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Xialin_He1;~Chengjing_Yuan1;~Wenxuan_Zhou1;~Ruihan_Yang2;~David_Held1;~Xiaolong_Wang3", "aff": "Shanghai Jiaotong University;University of California, San Diego;Carnegie Mellon University;University of California, San Diego;Carnegie Mellon University;University of California, San Diego", "aff_domain": "sjtu.edu.cn;ucsd.edu;andrew.cmu.edu;ucsd.edu;cmu.edu;ucsd.edu", "position": "Undergrad student;MS student;PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nhe2024visual,\ntitle={Visual Manipulation with Legs},\nauthor={Xialin He and Chengjing Yuan and Wenxuan Zhou and Ruihan Yang and David Held and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=E4K3yLQQ7s}\n}", "github": "", "project": "", "reviewers": "VjUa;xy5P;AEx8", "site": "https://openreview.net/forum?id=E4K3yLQQ7s", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16213207668870294043&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1;2;1", "aff_unique_norm": "Shanghai Jiao Tong University;University of California, San Diego;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.ucsd.edu;https://www.cmu.edu", "aff_unique_abbr": "SJTU;UCSD;CMU", "aff_campus_unique_index": "1;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "EM0wndCeoD", "title": "BiGym: A Demo-Driven Mobile Bi-Manual Manipulation Benchmark", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce BiGym, a new benchmark and learning environment for mobile bi-manual demo-driven robotic manipulation. BiGym features 40 diverse tasks set in home environments, ranging from simple target reaching to complex kitchen cleaning. To capture the real-world performance accurately, we provide human-collected demonstrations for each task, reflecting the diverse modalities found in real-world robot trajectories. 
BiGym supports a variety of observations, including proprioceptive data and visual inputs such as RGB, and depth from 3 camera views. To validate the usability of BiGym, we thoroughly benchmark the state-of-the-art imitation learning algorithms and demo-driven reinforcement learning algorithms within the environment and discuss the future opportunities.", "keywords": "Bi-Manual Manipulation;Mobile Manipulation;Benchmark", "primary_area": "", "supplementary_material": "/attachment/ef800c417e1cc31af464ffd97da90d232bbdde62.zip", "author": "Nikita Chernyadev;Nicholas Backshall;Xiao Ma;Yunfan Lu;Younggyo Seo;Stephen James", "authorids": "~Nikita_Chernyadev1;~Nicholas_Backshall1;~Xiao_Ma2;~Yunfan_Lu2;~Younggyo_Seo1;~Stephen_James1", "gender": "M;M;M;M;M;M", "homepage": ";https://www.linkedin.com/in/nicholas-backshall;https://yusufma03.github.io/;;https://younggyo.me/;https://stepjam.github.io/", "dblp": ";;35/573-6;;265/5586;163/5669", "google_scholar": "p18sazwAAAAJ;;hR4G6hoAAAAJ;HKg5U1MAAAAJ;tI1-YwIAAAAJ;OXtG-isAAAAJ", "orcid": ";;;;;", "linkedin": "nikita-chernyadev-8495417a/;;;;;", "or_profile": "~Nikita_Chernyadev1;~Nicholas_Backshall1;~Xiao_Ma2;~Yunfan_Lu2;~Younggyo_Seo1;~Stephen_James1", "aff": "Dyson;;Dyson Robot Learning Lab;;Dyson;Dyson", "aff_domain": "dyson.co.uk;;dyson.com;;dyson.com;dyson.com", "position": "Researcher;;Research Scientist;;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nchernyadev2024bigym,\ntitle={BiGym: A Demo-Driven Mobile Bi-Manual Manipulation Benchmark},\nauthor={Nikita Chernyadev and Nicholas Backshall and Xiao Ma and Yunfan Lu and Younggyo Seo and Stephen James},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EM0wndCeoD}\n}", "github": "https://github.com/chernyadev/bigym", "project": "", "reviewers": "bdfR;JNyU;RTpp;oipX", "site": "https://openreview.net/forum?id=EM0wndCeoD", "pdf_size": 0, "rating": "3;3;3;3", "confidence": "4;4;4;3", "rating_avg": 3.0, "confidence_avg": 3.75, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8695586512268244471&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Dyson", "aff_unique_dep": "", "aff_unique_url": "https://www.dyson.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "EPujQZWemk", "title": "ViPER: Visibility-based Pursuit-Evasion via Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "In visibility-based pursuit-evasion tasks, a team of mobile pursuer robots with limited sensing capabilities is tasked with detecting all evaders in a multiply-connected planar environment, whose map may or may not be known to pursuers beforehand. This requires tight coordination among multiple agents to ensure that the omniscient and potentially arbitrarily fast evaders are guaranteed to be detected by the pursuers. Whereas existing methods typically rely on a relatively large team of agents to clear the environment, we propose ViPER, a neural solution that leverages a graph attention network to learn a coordinated yet distributed policy via multi-agent reinforcement learning (MARL). 
We experimentally demonstrate that ViPER significantly outperforms other state-of-the-art non-learning planners, showcasing its emergent coordinated behaviors and adaptability to more challenging scenarios and various team sizes, and finally deploy its learned policies on hardware in an aerial search task.", "keywords": "MARL;pursuit-evasion;graph attention;path planning", "primary_area": "", "supplementary_material": "/attachment/99f55436f004f020a7bec156bd26ec9910e51c77.zip", "author": "Yizhuo Wang;Yuhong Cao;Jimmy Chiun;Subhadeep Koley;Mandy Pham;Guillaume Adrien Sartoretti", "authorids": "~Yizhuo_Wang1;~Yuhong_Cao1;~Jimmy_Chiun1;~Subhadeep_Koley2;~Mandy_Pham1;~Guillaume_Adrien_Sartoretti1", "gender": "M;M;M;M;;M", "homepage": "https://github.com/wyzh98;;;https://subhadeepk.github.io/;;https://marmotlab.org/", "dblp": ";;;;;118/9066", "google_scholar": ";;;;;n7NzZ0sAAAAJ", "orcid": ";0000-0001-8099-0689;0009-0009-5184-8291;;;0000-0002-7579-9916", "linkedin": ";;jimmychiun;;phamandy24/;", "or_profile": "~Yizhuo_Wang1;~Yuhong_Cao1;~Jimmy_Chiun1;~Subhadeep_Koley2;~Mandy_Pham1;~Guillaume_Adrien_Sartoretti1", "aff": "National University of Singapore;National University of Singapore;;Indian Institute of Engineering Science and Technology, Shibpur;University of California, Berkeley;National University of Singapore", "aff_domain": "u.nus.edu;u.nus.edu;;iiests.ac.in;berkeley.edu;nus.edu.sg", "position": "PhD student;PhD student;;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024viper,\ntitle={Vi{PER}: Visibility-based Pursuit-Evasion via Reinforcement Learning},\nauthor={Yizhuo Wang and Yuhong Cao and Jimmy Chiun and Subhadeep Koley and Mandy Pham and Guillaume Adrien Sartoretti},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EPujQZWemk}\n}", "github": "https://github.com/marmotlab/ViPER", "project": "", "reviewers": "ycfG;Z6Zd;fEXc", "site": "https://openreview.net/forum?id=EPujQZWemk", "pdf_size": 0, "rating": "3;3;3", "confidence": "2;5;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14939283146011177854&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "National University of Singapore;Indian Institute of Engineering Science and Technology;University of California, Berkeley", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.iiests.ac.in;https://www.berkeley.edu", "aff_unique_abbr": "NUS;IIEST Shibpur;UC Berkeley", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Shibpur;Berkeley", "aff_country_unique_index": "0;0;1;2;0", "aff_country_unique": "Singapore;India;United States" }, { "id": "EdVNB2kHv1", "title": "Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "A central challenge towards developing robots that can relate human language to their perception and actions is the scarcity of natural language annotations in diverse robot datasets. Moreover, robot policies that follow natural language instructions are typically trained on either templated language or expensive human-labeled instructions, hindering their scalability. \nTo this end, we introduce NILS: Natural language Instruction Labeling for Scalability. 
NILS automatically labels uncurated, long-horizon robot data at scale in a zero-shot manner without any human intervention.\nNILS combines pre-trained vision-language foundation models in a sophisticated, carefully considered manner in order to detect objects in a scene, detect object-centric changes, segment tasks from \nlarge datasets of unlabelled interaction data and ultimately label behavior datasets.\nEvaluations on BridgeV2 and a kitchen play dataset show that NILS is able to autonomously annotate diverse robot demonstrations of unlabeled and unstructured datasets, while alleviating several shortcomings of crowdsourced human annotations.", "keywords": "Foundation Models;Language-conditioned Imitation Learning;Data Labeling", "primary_area": "", "supplementary_material": "/attachment/4513a3d803d715c9e852fcfe0317963801b60391.zip", "author": "Nils Blank;Moritz Reuss;Marcel R\u00fchle;\u00d6mer Erdin\u00e7 Ya\u011fmurlu;Fabian Wenzel;Oier Mees;Rudolf Lioutikov", "authorids": "~Nils_Blank1;~Moritz_Reuss1;~Marcel_R\u00fchle1;~\u00d6mer_Erdin\u00e7_Ya\u011fmurlu1;~Fabian_Wenzel1;~Oier_Mees1;~Rudolf_Lioutikov1", "gender": ";M;M;M;M;M;M", "homepage": "https://www.kit.edu/index.php;;;https://github.com/omeryagmurlu;https://www.google.de;https://www.oiermees.com/;https://rudolf.intuitive-robots.net", "dblp": ";321/1769;;;;190/8659;151/9451", "google_scholar": ";NLuzkPIAAAAJ;;https://scholar.google.com/citations?hl=en;;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;hvjV43MAAAAJ", "orcid": ";;;;;;", "linkedin": ";;https://de.linkedin.com/in/marcel-r%C3%BChle-1348b3262?trk=public_profile_samename-profile;;;oier-mees-a3069488;rudolf-lioutikov-74830730a/", "or_profile": "~Nils_Blank1;~Moritz_Reuss1;~Marcel_R\u00fchle1;~\u00d6mer_Erdin\u00e7_Ya\u011fmurlu1;~Fabian_Wenzel1;~Oier_Mees1;~Rudolf_Lioutikov1", "aff": "Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Electrical Engineering & Computer Science Department, University of California, Berkeley;Karlsruher Institut f\u00fcr Technologie", "aff_domain": "kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;eecs.berkeley.edu;kit.edu", "position": "PhD student;PhD student;MS student;MS student;MS student;Postdoc;Tenure-Track Professor", "bibtex": "@inproceedings{\nblank2024scaling,\ntitle={Scaling Robot Policy Learning via Zero-Shot Labeling with Foundation Models},\nauthor={Nils Blank and Moritz Reuss and Marcel R{\\\"u}hle and {\\\"O}mer Erdin{\\c{c}} Ya{\\u{g}}murlu and Fabian Wenzel and Oier Mees and Rudolf Lioutikov},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EdVNB2kHv1}\n}", "github": "", "project": "", "reviewers": "p3J4;B6Tb;PVKm", "site": "https://openreview.net/forum?id=EdVNB2kHv1", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16886430710532721694&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;University of California, Berkeley", "aff_unique_dep": ";Electrical Engineering & Computer Science Department", "aff_unique_url": "https://www.kit.edu;https://www.berkeley.edu", "aff_unique_abbr": 
"KIT;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "Germany;United States" }, { "id": "EifoVoIyd5", "title": "What Matters in Range View 3D Object Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "Lidar-based perception pipelines rely on 3D object detection models to interpret complex scenes. While multiple representations for lidar exist, the range view is enticing since it losslessly encodes the entire lidar sensor output. In this work, we achieve state-of-the-art amongst range view 3D object detection models without using multiple techniques proposed in past range view literature. We explore range view 3D object detection across two modern datasets with substantially different properties: Argoverse 2 and Waymo Open. Our investigation reveals key insights: (1) input feature dimensionality significantly influences the overall performance, (2) surprisingly, employing a classification loss grounded in 3D spatial proximity works as well or better compared to more elaborate IoU-based losses, and (3) addressing non-uniform lidar density via a straightforward range subsampling technique outperforms existing multi-resolution, range-conditioned networks. Our experiments reveal that techniques proposed in recent range view literature are not needed to achieve state-of-the-art performance. Combining the above findings, we establish a new state-of-the-art model for range view 3D object detection \u2014 improving AP by 2.2% on the Waymo Open dataset while maintaining a runtime of 10 Hz. We are the first to benchmark a range view model on the Argoverse 2 dataset and outperform strong voxel-based baselines. All models are multi-class and open-source. 
Code is available at https://github.com/benjaminrwilson/range-view-3d-detection.", "keywords": "3D Object Detection;3D Perception;Autonomous Driving", "primary_area": "", "supplementary_material": "/attachment/568eaf5319ce1d2f8304bf2bcc3119b07b6e45c2.zip", "author": "Benjamin Wilson;Nicholas Autio Mitchell;Jhony Kaesemodel Pontes;James Hays", "authorids": "~Benjamin_Wilson3;~Nicholas_Autio_Mitchell1;~Jhony_Kaesemodel_Pontes1;~James_Hays1", "gender": "M;M;M;M", "homepage": "https://www.benjaminrwilson.com;;https://www.jhonykaesemodel.com;http://www.cc.gatech.edu/~hays/", "dblp": ";;176/8340;57/5958", "google_scholar": "mg9VHJIAAAAJ;;gOfBGxsAAAAJ;vjZrDKQAAAAJ", "orcid": ";;;0000-0001-7016-4252", "linkedin": ";nicholas-w-mitchell/;jhonykaesemodel/;james-h-hays/", "or_profile": "~Benjamin_Wilson3;~Nicholas_Autio_Mitchell1;~Jhony_Kaesemodel_Pontes1;~James_Hays1", "aff": "Georgia Institute of Technology;NVIDIA;Latitude AI;Georgia Institute of Technology", "aff_domain": "gatech.edu;nvidia.com;lat.ai;gatech.edu", "position": "PhD student;Researcher;Researcher;Associate professor", "bibtex": "@inproceedings{\nwilson2024what,\ntitle={What Matters in Range View 3D Object Detection},\nauthor={Benjamin Wilson and Nicholas Autio Mitchell and Jhony Kaesemodel Pontes and James Hays},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EifoVoIyd5}\n}", "github": "https://github.com/benjaminrwilson/range-view-3d-detection", "project": "", "reviewers": "sbqX;2jY8;SCbK", "site": "https://openreview.net/forum?id=EifoVoIyd5", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4998324142530933197&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA;Latitude AI", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com;https://www.latitude.ai", "aff_unique_abbr": "Georgia Tech;NVIDIA;Latitude AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "EiqQEsOMZt", "title": "TaMMa: Target-driven Multi-subscene Mobile Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "For everyday service robotics, the ability to navigate back and forth based on tasks in multi-subscene environments and perform delicate manipulations is crucial and highly practical.\nWhile existing robotics primarily focus on complex tasks within a single scene or simple tasks across scalable scenes individually, robots consisting of a mobile base with a robotic arm face the challenge of efficiently representing multiple subscenes, coordinating the collaboration between the mobile base and the robotic arm, and managing delicate tasks in scalable environments.\nTo address this issue, we propose Target-driven Multi-subscene Mobile Manipulation (\\textit{TaMMa}), which efficiently handles mobile base movement and fine-grained manipulation across subscenes. Specifically, we obtain a reliable 3D Gaussian initialization of the whole scene using a sparse 3D point cloud with encoded semantics. 
Through querying the coarse Gaussians, we acquire the approximate pose of the target, navigate the mobile base to approach it, and reduce the scope of precise target pose estimation to the corresponding subscene. Optimizing while moving, we employ diffusion-based depth completion to optimize fine-grained Gaussians and estimate the target's refined pose. For target-driven manipulation, we adopt Gaussians inpainting to obtain precise poses for the origin and destination of the operation in a \\textit{think before you do it} manner, enabling fine-grained manipulation. \nWe conduct various experiments on a real robotic to demonstrate our method in effectively and efficiently achieving precise operation tasks across multiple tabletop subscenes.", "keywords": "Multi-subscene;3D Gaussians;Scene Inpainting;Target-driven Mobile Manipulation", "primary_area": "", "supplementary_material": "/attachment/60e7ed38f298bf0e450a843c6685db707b9fbdb8.zip", "author": "Jiawei Hou;Tianyu Wang;Tongying Pan;Shouyan Wang;Xiangyang Xue;Yanwei Fu", "authorids": "~Jiawei_Hou2;~Tianyu_Wang10;~Tongying_Pan1;~Shouyan_Wang1;~Xiangyang_Xue2;~Yanwei_Fu2", "gender": "M;M;F;M;M;M", "homepage": "https://github.com/jarvishou829;https://github.com/Star-UU-Wang;https://github.com/KamiPan0601;https://istbi.fudan.edu.cn/info/1774/4601.htm;http://homepage.fudan.edu.cn//xyxue;http://yanweifu.github.io", "dblp": ";;;;84/3791;63/9065", "google_scholar": ";;;;;https://scholar.google.co.uk/citations?user=Vg54TcsAAAAJ", "orcid": "0000-0001-5830-6510;;;;0000-0002-4897-9209;0000-0002-6595-6893", "linkedin": ";;;;;", "or_profile": "~Jiawei_Hou2;~Tianyu_Wang10;~Tongying_Pan1;~Shouyan_Wang1;~Xiangyang_Xue2;~Yanwei_Fu2", "aff": "Fudan University;Fudan University;Fudan University;Fudan University;Fudan University;Fudan University,", "aff_domain": "fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn;fudan.edu.cn", "position": "PhD student;PhD student;MS student;Full Professor;Full Professor;Professor", "bibtex": "@inproceedings{\nhou2024tamma,\ntitle={Ta{MM}a: Target-driven Multi-subscene Mobile Manipulation},\nauthor={Jiawei Hou and Tianyu Wang and Tongying Pan and Shouyan Wang and Xiangyang Xue and Yanwei Fu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EiqQEsOMZt}\n}", "github": "", "project": "", "reviewers": "j6RG;pWYS;GbuM", "site": "https://openreview.net/forum?id=EiqQEsOMZt", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5961046273467126545&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Fudan University", "aff_unique_dep": "", "aff_unique_url": "https://www.fudan.edu.cn", "aff_unique_abbr": "Fudan", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "EyEE7547vy", "title": "Event3DGS: Event-Based 3D Gaussian Splatting for High-Speed Robot Egomotion", "track": "main", "status": "Poster", "tldr": "", "abstract": "By combining differentiable rendering with explicit point-based scene representations, 3D Gaussian Splatting (3DGS) has demonstrated breakthrough 3D reconstruction capabilities. 
\n However, to date 3DGS has had limited impact on robotics, where high-speed egomotion is pervasive: Egomotion introduces motion blur and leads to artifacts in existing frame-based 3DGS reconstruction methods. \n To address this challenge, we introduce Event3DGS, an event-based 3DGS framework.\n By exploiting the exceptional temporal resolution of event cameras, Event3DGS can reconstruct high-fidelity 3D structure and appearance under high-speed egomotion. \n Extensive experiments on multiple synthetic and real-world datasets demonstrate the superiority of Event3DGS compared with existing event-based dense 3D scene reconstruction frameworks; Event3DGS substantially improves reconstruction quality (+3dB) while reducing computational costs by 95\%. \n Our framework also allows one to incorporate a few motion-blurred frame-based measurements into the reconstruction process to further improve appearance fidelity without loss of structural accuracy.", "keywords": "Event-based 3D Reconstruction;Gaussian Splatting;High-speed Robot Egomotion", "primary_area": "", "supplementary_material": "/attachment/2c1c3cee6c58ad804c1e8a0993cee6752515e4d0.zip", "author": "Tianyi Xiong;Jiayi Wu;Botao He;Cornelia Fermuller;Yiannis Aloimonos;Heng Huang;Christopher Metzler", "authorids": "~Tianyi_Xiong1;~Jiayi_Wu9;~Botao_He1;~Cornelia_Fermuller3;~Yiannis_Aloimonos1;~Heng_Huang1;~Christopher_Metzler1", "gender": "M;M;M;F;M;M;M", "homepage": "https://tyxiong23.github.io/;https://jiayi-wu-leo.github.io/;https://bottle101.github.io/;http://users.umiacs.umd.edu/users/fer/;http://www.prg.cs.umd.edu;https://www.cs.umd.edu/~heng/;https://www.cs.umd.edu/~metzler/", "dblp": ";;;f/CorneliaFermuller;a/YiannisAloimonos;03/281;147/4828", "google_scholar": "dv0NRZgAAAAJ;xoZE1GsAAAAJ;9ytS6o8AAAAJ;0gEOJSEAAAAJ;https://scholar.google.com/citations?hl=en;4OqLaDwAAAAJ;on7GFpYAAAAJ", "orcid": ";0009-0004-6601-4963;;0000-0003-2044-2386;;;", "linkedin": "tianyi-xiong-52389924a/;jiayi-wu-leo/;;cornelia-fermuller-594b855/;yiannis-aloimonos-6374865/;;", "or_profile": "~Tianyi_Xiong1;~Jiayi_Wu9;~Botao_He1;~Cornelia_Fermuller3;~Yiannis_Aloimonos1;~Heng_Huang1;~Christopher_Metzler1", "aff": "University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;University of Maryland, College Park;Department of Computer Science, University of Maryland, College Park;University of Maryland, College Park", "aff_domain": "umd.edu;umd.edu;umd.edu;umd.edu;umd.edu;cs.umd.edu;umd.edu", "position": "PhD student;PhD student;PhD student;Research Scientist;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxiong2024eventdgs,\ntitle={Event3{DGS}: Event-Based 3D Gaussian Splatting for High-Speed Robot Egomotion},\nauthor={Tianyi Xiong and Jiayi Wu and Botao He and Cornelia Fermuller and Yiannis Aloimonos and Heng Huang and Christopher Metzler},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=EyEE7547vy}\n}", "github": "", "project": "", "reviewers": "a756;AkQ7;VHGX", "site": "https://openreview.net/forum?id=EyEE7547vy", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5955756988940327642&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;1;0", 
"aff_unique_norm": "University of Maryland;University of Maryland, College Park", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www/umd.edu;https://www/umd.edu", "aff_unique_abbr": "UMD;UMD", "aff_campus_unique_index": "0;0;0;0;0;0;0", "aff_campus_unique": "College Park", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "F0rWEID2gb", "title": "Environment Curriculum Generation via Large Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent work has demonstrated that a promising strategy for teaching robots a wide range of complex skills is by training them on a curriculum of progressively more challenging environments. However, developing an effective curriculum of environment distributions currently requires significant expertise, which must be repeated for every new domain. Our key insight is that environments are often naturally represented as code. Thus, we probe whether effective environment curriculum design can be achieved and automated via code generation by large language models (LLM). In this paper, we introduce Eurekaverse, an unsupervised environment design algorithm that uses LLMs to sample progressively more challenging, diverse, and learnable environments for skill training. We validate Eurekaverse's effectiveness in the domain of quadrupedal parkour learning, in which a quadruped robot must traverse through a variety of obstacle courses. The automatic curriculum designed by Eurekaverse enables gradual learning of complex parkour skills in simulation and can successfully transfer to the real-world, outperforming manual training courses designed by humans.", "keywords": "Large Language Models;Environment Curriculum;Quadrupeds;Sim-To-Real Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/cb85ec72431fd55308a983d21177363303635c1a.zip", "author": "William Liang;Sam Wang;Hung-Ju Wang;Osbert Bastani;Dinesh Jayaraman;Yecheng Jason Ma", "authorids": "~William_Liang1;~Sam_Wang1;~Hung-Ju_Wang1;~Osbert_Bastani1;~Dinesh_Jayaraman2;~Yecheng_Jason_Ma2", "gender": "M;;M;M;M;", "homepage": "https://www.seas.upenn.edu/~wjhliang/;https://samuelwang23.github.io/;;http://obastani.github.io;https://www.seas.upenn.edu/~dineshj/;", "dblp": ";;;21/11275;145/3870;", "google_scholar": ";;;cxYepGkAAAAJ;QxLpghAAAAAJ;", "orcid": ";;;;0000-0002-6888-3095;", "linkedin": ";sam-wang-penn/;hungju-wang-5a5124172/;;dinesh-jayaraman-44b31539/;", "or_profile": "~William_Liang1;~Sam_Wang1;~Hung-Ju_Wang1;~Osbert_Bastani1;~Dinesh_Jayaraman2;~Yecheng_Jason_Ma2", "aff": "University of Pennsylvania;University of Pennsylvania;;University of Pennsylvania;University of Pennsylvania;", "aff_domain": "upenn.edu;seas.upenn.edu;;upenn.edu;upenn.edu;", "position": "Undergrad student;Undergrad student;;Assistant Professor;Assistant Professor;", "bibtex": "@inproceedings{\nliang2024environment,\ntitle={Environment Curriculum Generation via Large Language Models},\nauthor={William Liang and Sam Wang and Hung-Ju Wang and Osbert Bastani and Dinesh Jayaraman and Yecheng Jason Ma},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=F0rWEID2gb}\n}", "github": "", "project": "", "reviewers": "XMA4;LPUg;VvEd", "site": "https://openreview.net/forum?id=F0rWEID2gb", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 
0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=19514856782024333&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "FHnVRmeqxf", "title": "FlowRetrieval: Flow-Guided Data Retrieval for Few-Shot Imitation Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Imitation learning policies in robotics tend to require an extensive amount of demonstrations. It is critical to develop few-shot adaptation strategies that rely only on a small amount of task-specific human demonstrations. Prior works focus on learning general policies from large scale dataset with diverse behaviors. Recent research has shown that directly retrieving relevant past experiences to augment policy learning has great promise in few-shot settings. However, existing data retrieval methods fall under two extremes: they either rely on the existence of exact same behaviors with visually similar scenes in the prior data, which is impractical to assume; or they retrieve based on semantic similarity of high-level language descriptions of the task, which might not be that informative about the shared behaviors or motions across tasks. In this work, we investigate how we can leverage motion similarity in the vast amount of cross-task data to improve few-shot imitation learning of the target task. Our key insight is that motion-similar data carry rich information about the effects of actions and object interactions that can be leveraged during few-shot adaptation. We propose FlowRetrieval, an approach that leverages optical flow representations for both extracting similar motions to target tasks from prior data, and for guiding learning of a policy that can maximally benefit from such data. Our results show FlowRetrieval significantly outperforms prior methods across simulated and real-world domains, achieving on average 27% higher success rate than the best retrieval-based prior method. 
In the Pen-in-Cup task with a real Franka Emika robot, FlowRetrieval achieves 3.7x the performance of the baseline learning from all prior and target data.", "keywords": "Data Retrieval;Few-shot Learning;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/05242c5d54d3fd9a1e9b5301afb8130136f1648f.zip", "author": "Li-Heng Lin;Yuchen Cui;Amber Xie;Tianyu Hua;Dorsa Sadigh", "authorids": "~Li-Heng_Lin1;~Yuchen_Cui1;~Amber_Xie1;~Tianyu_Hua1;~Dorsa_Sadigh1", "gender": "M;F;;;F", "homepage": "https://lihenglin.github.io;https://yuchencui.cc;;;https://dorsa.fyi/", "dblp": ";201/5416.html;;;117/3174", "google_scholar": "https://scholar.google.com/citations?hl=en;qQz2cm8AAAAJ;https://scholar.google.com/citations?hl=en;;ZaJEZpYAAAAJ", "orcid": ";0000-0001-7417-1222;;;", "linkedin": ";;;;", "or_profile": "~Li-Heng_Lin1;~Yuchen_Cui1;~Amber_Xie1;~Tianyu_Hua1;~Dorsa_Sadigh1", "aff": "Stanford University;Stanford University;;;Stanford University", "aff_domain": "stanford.edu;stanford.edu;;;stanford.edu", "position": "MS student;Postdoc;;;Assistant Professor", "bibtex": "@inproceedings{\nlin2024flowretrieval,\ntitle={FlowRetrieval: Flow-Guided Data Retrieval for Few-Shot Imitation Learning},\nauthor={Li-Heng Lin and Yuchen Cui and Amber Xie and Tianyu Hua and Dorsa Sadigh},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FHnVRmeqxf}\n}", "github": "", "project": "", "reviewers": "Z3WV;JhhG;GJ3C", "site": "https://openreview.net/forum?id=FHnVRmeqxf", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1653547116664315&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "FO6tePGRZj", "title": "Mobile ALOHA: Learning Bimanual Mobile Manipulation using Low-Cost Whole-Body Teleoperation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Imitation learning from human demonstrations has shown impressive performance in robotics. However, most results focus on table-top manipulation, lacking the mobility and dexterity necessary for generally useful tasks. In this work, we develop a system for imitating mobile manipulation tasks that are bimanual and require whole-body control. We first present Mobile ALOHA, a low-cost and whole-body teleoperation system for data collection. It augments the ALOHA system with a mobile base, and a whole-body teleoperation interface. Using data collected with Mobile ALOHA, we then perform supervised behavior cloning and find that co-training with existing static ALOHA datasets boosts performance on mobile manipulation tasks. With 50 demonstrations for each task, co-training can increase success rates by up to 90\\%, allowing Mobile ALOHA to autonomously complete complex mobile manipulation tasks such as sauteing and serving a piece of shrimp, opening a two-door wall cabinet to store heavy cooking pots, calling and entering an elevator, and lightly rinsing a used pan using a kitchen faucet. 
We will open-source all the hardware and software implementations upon publication.", "keywords": "Mobile Manipulation;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/6c76dd385b64d19e161cc7a0f414f19554086e46.zip", "author": "Zipeng Fu;Tony Z. Zhao;Chelsea Finn", "authorids": "~Zipeng_Fu1;~Tony_Z._Zhao1;~Chelsea_Finn1", "gender": "M;;F", "homepage": "https://zipengfu.github.io;https://tonyzhaozh.github.io/;https://ai.stanford.edu/~cbfinn/", "dblp": "245/1504;;131/1783", "google_scholar": "wMcPTbEAAAAJ;;vfPE6hgAAAAJ", "orcid": ";;", "linkedin": "zipengfu;;", "or_profile": "~Zipeng_Fu1;~Tony_Z._Zhao1;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Google", "aff_domain": "stanford.edu;stanford.edu;google.com", "position": "PhD student;PhD student;Research Scientist", "bibtex": "@inproceedings{\nfu2024mobile,\ntitle={Mobile {ALOHA}: Learning Bimanual Mobile Manipulation using Low-Cost Whole-Body Teleoperation},\nauthor={Zipeng Fu and Tony Z. Zhao and Chelsea Finn},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=FO6tePGRZj}\n}", "github": "https://github.com/MarkFzp/mobile-aloha", "project": "", "reviewers": "JTSs;31qG;68bJ", "site": "https://openreview.net/forum?id=FO6tePGRZj", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1678059520765572021&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "G0jqGG8Tta", "title": "Not All Errors Are Made Equal: A Regret Metric for Detecting System-level Trajectory Prediction Failures", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robot decision-making increasingly relies on data-driven human prediction models when operating around people. While these models are known to mispredict in out-of-distribution interactions, only a subset of prediction errors impact downstream robot performance. \nWe propose characterizing such ``system-level'' prediction failures via the mathematical notion of regret: high-regret interactions are precisely those in which mispredictions degraded closed-loop robot performance. \nWe further introduce a probabilistic generalization of regret that calibrates failure detection across disparate deployment contexts and renders regret compatible with reward-based and reward-free (e.g., generative) planners. \nIn simulated autonomous driving interactions, we showcase that our system-level failure metric can automatically mine for closed-loop human-robot interactions that state-of-the-art generative human predictors and robot planners struggle with. \nWe further find that the very presence of high-regret data during human predictor fine-tuning is highly predictive of robot re-deployment performance improvements. 
\nFurthermore, fine-tuning with the informative but significantly smaller high-regret data (23% of deployment data) is competitive with fine-tuning on the full deployment dataset, indicating a promising avenue for efficiently mitigating system-level human-robot interaction failures.", "keywords": "Human-Robot Interaction;Trajectory Prediction;Failure Detection", "primary_area": "", "supplementary_material": "/attachment/40493bfe36615a0c66601547f9c9350623ff6802.zip", "author": "Kensuke Nakamura;Thomas Tian;Andrea Bajcsy", "authorids": "~Kensuke_Nakamura1;~Thomas_Tian1;~Andrea_Bajcsy1", "gender": ";M;F", "homepage": ";https://scholar.google.com/citations?user=uY4D8-wAAAAJ&hl=en&authuser=1;https://www.cs.cmu.edu/~abajcsy/", "dblp": ";;208/0997", "google_scholar": "https://scholar.google.ca/citations?hl=en;;LUe32ToAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Kensuke_Nakamura1;~Thomas_Tian1;~Andrea_Victoria_Bajcsy1", "aff": "Carnegie Mellon University;University of California, Berkeley;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;berkeley.edu;cmu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nnakamura2024not,\ntitle={Not All Errors Are Made Equal: A Regret Metric for Detecting System-level Trajectory Prediction Failures},\nauthor={Kensuke Nakamura and Thomas Tian and Andrea Bajcsy},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G0jqGG8Tta}\n}", "github": "", "project": "", "reviewers": "ehM7;HApx;t3uw", "site": "https://openreview.net/forum?id=G0jqGG8Tta", "pdf_size": 0, "rating": "1;2;4", "confidence": "4;3;4", "rating_avg": 2.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.18898223650461363, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6247034828810691407&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.berkeley.edu", "aff_unique_abbr": "CMU;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "G8UcwxNAoD", "title": "Teaching Robots with Show and Tell: Using Foundation Models to Synthesize Robot Policies from Language and Visual Demonstration", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce a modular, neuro-symbolic framework for teaching robots new skills through language and visual demonstration. Our approach, ShowTell, composes a mixture of foundation models to synthesize robot manipulation programs that are easy to interpret and generalize across a wide range of tasks and environments. ShowTell is designed to handle complex demonstrations involving high level logic such as loops and conditionals while being intuitive and natural for end-users. 
We validate this approach through a series of real-world robot experiments, showing that ShowTell out-performs a state-of-the-art baseline based on GPT4-V, on a variety of tasks, and that it is able to generalize to unseen environments and within category objects.", "keywords": "learning from demonstration;language model planning;neuro-symbolic reasoning", "primary_area": "", "supplementary_material": "", "author": "Michael Murray;Abhishek Gupta;Maya Cakmak", "authorids": "~Michael_Murray2;~Abhishek_Gupta1;~Maya_Cakmak1", "gender": ";M;F", "homepage": "https://mmurray.com;https://homes.cs.washington.edu/~abhgupta/;https://homes.cs.washington.edu/~mcakmak/", "dblp": ";18/6404-4;65/6092", "google_scholar": ";1wLVDP4AAAAJ;https://scholar.google.com.tw/citations?user=sPlonWcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Michael_Murray2;~Abhishek_Gupta1;~Maya_\u00c7akmak1", "aff": "University of Washington;University of Washington;University of Washington, Seattle", "aff_domain": "washington.edu;uw.edu;uw.edu", "position": "PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nmurray2024teaching,\ntitle={Teaching Robots with Show and Tell: Using Foundation Models to Synthesize Robot Policies from Language and Visual Demonstration},\nauthor={Michael Murray and Abhishek Gupta and Maya Cakmak},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=G8UcwxNAoD}\n}", "github": "", "project": "", "reviewers": "2euh;F4Hd;sbTj", "site": "https://openreview.net/forum?id=G8UcwxNAoD", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8380763067808598752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Washington", "aff_unique_dep": "", "aff_unique_url": "https://www.washington.edu", "aff_unique_abbr": "UW", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "GGuNkjQSrk", "title": "Action Space Design in Reinforcement Learning for Robot Motor Skills", "track": "main", "status": "Poster", "tldr": "", "abstract": "Practitioners often rely on intuition to select action spaces for learning. The choice can substantially impact final performance even when choosing among configuration-space representations such as joint position, velocity, and torque commands. We examine action space selection considering a wheeled-legged robot, a quadruped robot, and a simulated suite of locomotion, manipulation, and control tasks. \nWe analyze the mechanisms by which action space can improve performance and conclude that the action space can influence learning performance substantially in a task-dependent way. Moreover, we find that much of the practical impact of action space selection on learning dynamics can be explained by improved policy initialization and behavior between timesteps.", "keywords": "Reinforcement Learning;Action Spaces;Sim-to-Real", "primary_area": "", "supplementary_material": "/attachment/372e91ffd517e3ea1305387ff9ef939f4265e773.zip", "author": "Julian E\u00dfer;Gabriel B. 
Margolis;Oliver Urbann;S\u00f6ren Kerner;Pulkit Agrawal", "authorids": "~Julian_E\u00dfer1;~Gabriel_B._Margolis1;~Oliver_Urbann1;soeren.kerner@iml.fraunhofer.de;~Pulkit_Agrawal1", "gender": "M;;M;;M", "homepage": ";;;;https://people.eecs.berkeley.edu/~pulkitag/", "dblp": ";;;;149/2672", "google_scholar": "Ih6n0p4AAAAJ;;https://scholar.google.com/citations?hl=en;;UpZmJI0AAAAJ", "orcid": ";;;;", "linkedin": "julian-esser/;;;;", "or_profile": "~Julian_E\u00dfer1;~Gabriel_B._Margolis1;~Oliver_Urbann1;soeren.kerner@iml.fraunhofer.de;~Pulkit_Agrawal1", "aff": "Fraunhofer IML;;Fraunhofer IML;;Massachusetts Institute of Technology", "aff_domain": "fraunhofer.de;;fraunhofer.de;;mit.edu", "position": "Researcher;;Postdoc;;Assistant Professor", "bibtex": "@inproceedings{\ne{\\ss}er2024action,\ntitle={Action Space Design in Reinforcement Learning for Robot Motor Skills},\nauthor={Julian E{\\ss}er and Gabriel B. Margolis and Oliver Urbann and S{\\\"o}ren Kerner and Pulkit Agrawal},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GGuNkjQSrk}\n}", "github": "", "project": "", "reviewers": "15bq;29AA;FMNG", "site": "https://openreview.net/forum?id=GGuNkjQSrk", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15738840836706970193&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "Fraunhofer Institute for Material Flow and Logistics;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.iml.fraunhofer.de/;https://web.mit.edu", "aff_unique_abbr": "Fraunhofer IML;MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1", "aff_country_unique": "Germany;United States" }, { "id": "GVX6jpZOhU", "title": "RoboPoint: A Vision-Language Model for Spatial Affordance Prediction in Robotics", "track": "main", "status": "Poster", "tldr": "", "abstract": "From rearranging objects on a table to putting groceries into shelves, robots must plan precise action points to perform tasks accurately and reliably. In spite of the recent adoption of vision language models (VLMs) to control robot behavior, VLMs struggle to precisely articulate robot actions using language. We introduce an automatic synthetic data generation pipeline that instruction-tunes VLMs to robotic domains and needs. Using the pipeline, we train RoboPoint, a VLM that predicts image keypoint affordances given language instructions. Compared to alternative approaches, our method requires no real-world data collection or human demonstration, making it much more scalable to diverse environments and viewpoints. In addition, RoboPoint is a general model that enables several downstream applications such as robot navigation, manipulation, and augmented reality (AR) assistance. Our experiments demonstrate that RoboPoint outperforms state-of-the-art VLMs (GPT-4o) and visual prompting techniques (PIVOT) by 21.8% in the accuracy of predicting spatial affordance and by 30.5% in the success rate of downstream tasks. 
Anonymous project page: https://robopoint.github.io.", "keywords": "Foundation Model;Affordance Prediction;Open-world Manipulation", "primary_area": "", "supplementary_material": "/attachment/b3e1c4b237f0187ed29cd45e22bf0745d72775fb.zip", "author": "Wentao Yuan;Jiafei Duan;Valts Blukis;Wilbert Pumacay;Ranjay Krishna;Adithyavairavan Murali;Arsalan Mousavian;Dieter Fox", "authorids": "~Wentao_Yuan1;~Jiafei_Duan1;~Valts_Blukis1;~Wilbert_Pumacay1;~Ranjay_Krishna1;~Adithyavairavan_Murali2;~Arsalan_Mousavian1;~Dieter_Fox1", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://wentaoyuan.github.io;https://duanjiafei.com/;;https://wpumacay.github.io;http://ranjaykrishna.com;http://adithyamurali.com;https://cs.gmu.edu/~amousavi/;https://homes.cs.washington.edu/~fox/", "dblp": "225/4795.html;275/9973.html;210/9692;;167/3785;;164/8572;f/DieterFox", "google_scholar": "PZZZG6YAAAAJ;d1WCSJIAAAAJ;i9-GzNYAAAAJ;;IcqahyAAAAAJ;Tjj8TZAAAAAJ;fcA9m88AAAAJ;DqXsbPAAAAAJ", "orcid": "0000-0002-3836-8877;;;;0000-0001-8784-2531;;;", "linkedin": ";jiafei-duan-a69b11112/;valtsblukis/;;ranjay-krishna-1a344444/;adithyamurali;;", "or_profile": "~Wentao_Yuan1;~Jiafei_Duan1;~Valts_Blukis1;~Wilbert_Pumacay1;~Ranjay_Krishna1;~Adithyavairavan_Murali2;~Arsalan_Mousavian1;~Dieter_Fox1", "aff": "University of Washington, Seattle;NVIDIA;NVIDIA;Universidad Nacional de Ingenier\u00eda;University of Washington;;NVIDIA;Department of Computer Science", "aff_domain": "uw.edu;nvidia.com;nvidia.com;uni.edu.pe;cs.washington.edu;;nvidia.com;cs.washington.edu", "position": "PhD student;Intern;Researcher;Researcher;Assistant Professor;;Research Scientist;Full Professor", "bibtex": "@inproceedings{\nyuan2024robopoint,\ntitle={RoboPoint: A Vision-Language Model for Spatial Affordance Prediction in Robotics},\nauthor={Wentao Yuan and Jiafei Duan and Valts Blukis and Wilbert Pumacay and Ranjay Krishna and Adithyavairavan Murali and Arsalan Mousavian and Dieter Fox},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=GVX6jpZOhU}\n}", "github": "https://github.com/wentaoyuan/RoboPoint", "project": "", "reviewers": "px3Y;Wr3v;zjRR", "site": "https://openreview.net/forum?id=GVX6jpZOhU", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;5", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 1.0, "gs_citation": 53, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2186558977018690360&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;0;1;3", "aff_unique_norm": "University of Washington;NVIDIA;Universidad Nacional de Ingenier\u00eda;Unknown Institution", "aff_unique_dep": ";NVIDIA Corporation;;Department of Computer Science", "aff_unique_url": "https://www.washington.edu;https://www.nvidia.com;https://www.uni.edu.pe;", "aff_unique_abbr": "UW;NVIDIA;UNI;", "aff_campus_unique_index": "0", "aff_campus_unique": "Seattle;", "aff_country_unique_index": "0;0;0;1;0;0", "aff_country_unique": "United States;Peru;" }, { "id": "HlxRd529nG", "title": "Detect Everything with Few Examples", "track": "main", "status": "Poster", "tldr": "", "abstract": "Few-shot object detection aims at detecting novel categories given only a few example images. It is a basic skill for a robot to perform tasks in open environments. Recent methods focus on finetuning strategies, with complicated procedures that prohibit a wider application. 
In this paper, we introduce DE-ViT, a few-shot object detector without the need for finetuning. DE-ViT's novel architecture is based on a new region-propagation mechanism for localization. The propagated region masks are transformed into bounding boxes through a learnable spatial integral layer. Instead of training prototype classifiers, we propose to use prototypes to project ViT features into a subspace that is robust to overfitting on base classes. We evaluate DE-ViT on few-shot, and one-shot object detection benchmarks with Pascal VOC, COCO, and LVIS. DE-ViT establishes new state-of-the-art results on all benchmarks. Notably, for COCO, DE-ViT surpasses the few-shot SoTA by 15 mAP on 10-shot and 7.2 mAP on 30-shot and one-shot SoTA by 2.8 AP50. For LVIS, DE-ViT outperforms few-shot SoTA by 17 box APr. Further, we evaluate DE-ViT with a real robot by building a pick-and-place system for sorting novel objects based on example images. The videos of our robot demonstrations, the source code and the models of DE-ViT can be found at https://mlzxy.github.io/devit.", "keywords": "Robot Vision;Object Detection and Recognition;Few-shot Learning", "primary_area": "", "supplementary_material": "/attachment/ed7deebd1406ef0bd4419b83702ff473bf85f0c1.zip", "author": "Xinyu Zhang;Yuhan Liu;Yuting Wang;Abdeslam Boularias", "authorids": "~Xinyu_Zhang7;~Yuhan_Liu2;~Yuting_Wang2;~Abdeslam_Boularias1", "gender": "M;M;;M", "homepage": "https://mlzxy.github.io/;;;http://rl.cs.rutgers.edu/", "dblp": ";125/8141;09/8269-4;57/2269", "google_scholar": "M7hnG9oAAAAJ;https://scholar.google.com/citations?hl=en;o9V5WAYAAAAJ;https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xinyu_Zhang7;~Yuhan_Liu2;~Yuting_Wang2;~Abdeslam_Boularias1", "aff": "Rutgers University;Rutgers University;Amazon;, Rutgers University", "aff_domain": "rutgers.edu;rutgers.edu;amazon.com;cs.rutgers.edu", "position": "PhD student;PhD student;Researcher;Associate Professor", "bibtex": "@inproceedings{\nzhang2024detect,\ntitle={Detect Everything with Few Examples},\nauthor={Xinyu Zhang and Yuhan Liu and Yuting Wang and Abdeslam Boularias},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=HlxRd529nG}\n}", "github": "http://github.com/mlzxy/devit", "project": "", "reviewers": "dSpH;dhu9;3v8u", "site": "https://openreview.net/forum?id=HlxRd529nG", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2956108232415386259&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Rutgers University;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.rutgers.edu;https://www.amazon.com", "aff_unique_abbr": "Rutgers;Amazon", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "IcOrwlXzMi", "title": "VLM-Grounder: A VLM Agent for Zero-Shot 3D Visual Grounding", "track": "main", "status": "Poster", "tldr": "", "abstract": "3D visual grounding is crucial for robots, requiring integration of natural language and 3D scene understanding. Traditional methods depend on supervised learning with 3D point clouds are limited by scarce datasets. 
Recently zero-shot methods leveraging LLMs have been proposed to address the data issue. While effective, these methods often miss detailed scene context, limiting their ability to handle complex queries. In this work, we present VLM-Grounder, a novel framework using vision-language models (VLMs) for zero-shot 3D visual grounding based solely on 2D images. VLM-Grounder dynamically stitches image sequences, employs a grounding and feedback scheme to find the target object, and uses a multi-view ensemble projection to accurately estimate 3D bounding boxes. Experiments on ScanRefer and Nr3D datasets show VLM-Grounder outperforms previous zero-shot methods, achieving 51.6\\% Acc@0.25 on ScanRefer and 48.0\\% Acc on Nr3D, without relying on 3D geometry or object priors.", "keywords": "3D Visual Grounding;VLM Agent;Zero-Shot", "primary_area": "", "supplementary_material": "/attachment/ea028a1f1b660b0558f048d7c9fe2745e0878ec1.zip", "author": "Runsen Xu;Zhiwei Huang;Tai Wang;Yilun Chen;Jiangmiao Pang;Dahua Lin", "authorids": "~Runsen_Xu1;~Zhiwei_Huang4;~Tai_Wang2;~Yilun_Chen1;~Jiangmiao_Pang1;~Dahua_Lin1", "gender": "M;M;M;M;M;M", "homepage": ";https://github.com/huang583824382?tab=repositories;http://yilunchen.com/about/;https://oceanpang.github.io/;http://dahua.site;https://tai-wang.github.io/", "dblp": "289/6916;;;231/7630;53/6088;", "google_scholar": "MOobrCcAAAAJ;;gKXC9Q8AAAAJ;https://scholar.google.com/citations?authuser=0;GMzzRRUAAAAJ;JmbbZWIAAAAJ", "orcid": ";;0000-0003-3372-8703;0000-0002-6711-9319;;", "linkedin": "runsen-xu-4262a3272/;;yilunchen-cuhk/;;;%E6%B3%B0-%E7%8E%8B-2b2738147/", "or_profile": "~Runsen_Xu1;~Zhiwei_Huang4;~Yilun_Chen1;~Jiangmiao_Pang1;~Dahua_Lin1;~Tai_WANG1", "aff": "The Chinese University of Hong Kong;Zhejiang University;Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory ;The Chinese University of Hong Kong;Shanghai AI Laboratory", "aff_domain": "ie.cuhk.edu;zju.edu.cn;pjlab.org.cn;pjlab.org.cn;cuhk.edu.hk;pjlab.org.cn", "position": "PhD student;MS student;Researcher;Research Scientist;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nxu2024vlmgrounder,\ntitle={{VLM}-Grounder: A {VLM} Agent for Zero-Shot 3D Visual Grounding},\nauthor={Runsen Xu and Zhiwei Huang and Tai Wang and Yilun Chen and Jiangmiao Pang and Dahua Lin},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IcOrwlXzMi}\n}", "github": "https://github.com/OpenRobotLab/VLM-Grounder", "project": "", "reviewers": "oE1Z;oazi;mxB2", "site": "https://openreview.net/forum?id=IcOrwlXzMi", "pdf_size": 0, "rating": "2;2;3", "confidence": "3;4;5", "rating_avg": 2.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17955735960020044664&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;0;3", "aff_unique_norm": "Chinese University of Hong Kong;Zhejiang University;Shanghai Artificial Intelligence Laboratory;Shanghai AI Laboratory", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.cuhk.edu.hk;https://www.zju.edu.cn;http://www.shailab.org/;https://www.shanghai-ai-lab.com", "aff_unique_abbr": "CUHK;ZJU;Shanghai AI Lab;SAIL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "InT87E5sr4", "title": "Dreamitate: Real-World Visuomotor Policy 
Learning via Video Generation", "track": "main", "status": "Poster", "tldr": "", "abstract": "A key challenge in manipulation is learning a policy that can robustly generalize to diverse visual environments. A promising mechanism for learning robust policies is to leverage video generative models, which are pretrained on large-scale datasets of internet videos. In this paper, we propose a visuomotor policy learning framework that fine-tunes a video diffusion model on human demonstrations of a given task. At test time, we generate an example of an execution of the task conditioned on images of a novel scene, and use this synthesized execution directly to control the robot. Our key insight is that using common tools allows us to effortlessly bridge the embodiment gap between the human hand and the robot manipulator. We evaluate our approach on 4 tasks of increasing complexity and demonstrate that capitalizing on internet-scale generative models allows the learned policy to achieve a significantly higher degree of generalization than existing behavior cloning approaches.", "keywords": "Imitation Learning;Visuomotor Policy;Video Generation", "primary_area": "", "supplementary_material": "/attachment/0ee17f0c408f0946d8089691c6a6ed1b555d3cf2.zip", "author": "Junbang Liang;Ruoshi Liu;Ege Ozguroglu;Sruthi Sudhakar;Achal Dave;Pavel Tokmakov;Shuran Song;Carl Vondrick", "authorids": "~Junbang_Liang2;~Ruoshi_Liu2;~Ege_Ozguroglu1;~Sruthi_Sudhakar1;~Achal_Dave1;~Pavel_Tokmakov2;~Shuran_Song3;~Carl_Vondrick2", "gender": "M;M;M;F;M;M;F;M", "homepage": ";https://ruoshiliu.github.io/;;https://sruthisudhakar.github.io/;http://www.achaldave.com/;https://pvtokmakov.github.io/home/;https://shurans.github.io/;http://www.cs.columbia.edu/~vondrick/", "dblp": ";283/4797;;;156/1161;153/2264;;26/8610", "google_scholar": "saAxMFYAAAAJ;suAawHYAAAAJ;585Dh2wAAAAJ;;oQyYH9kAAAAJ;https://scholar.google.fr/citations?user=b15vJuEAAAAJ;https://scholar.google.com/citations?hl=en;3MzhkFIAAAAJ", "orcid": ";;;;;;;", "linkedin": ";ruoshi-liu-a5046aa0/;ege-o-593405125;;;;;", "or_profile": "~Junbang_Liang2;~Ruoshi_Liu2;~Ege_Ozguroglu1;~Sruthi_Sudhakar1;~Achal_Dave1;~Pavel_Tokmakov2;~Shuran_Song3;~Carl_Vondrick2", "aff": "Columbia University;Columbia University;Columbia University;Columbia University;Toyota Research Institute;Toyota Research Institute;Stanford University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;columbia.edu;columbia.edu;tri.global;tri.global;stanford.edu;columbia.edu", "position": "MS student;PhD student;PhD student;PhD student;Researcher;Research Scientist;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliang2024dreamitate,\ntitle={Dreamitate: Real-World Visuomotor Policy Learning via Video Generation},\nauthor={Junbang Liang and Ruoshi Liu and Ege Ozguroglu and Sruthi Sudhakar and Achal Dave and Pavel Tokmakov and Shuran Song and Carl Vondrick},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=InT87E5sr4}\n}", "github": "https://dreamitate.cs.columbia.edu/", "project": "", "reviewers": "MoCz;RxCE;w1Yn;gdSH", "site": "https://openreview.net/forum?id=InT87E5sr4", "pdf_size": 0, "rating": "2;2;3;3", "confidence": "5;3;5;4", "rating_avg": 2.5, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 8, "corr_rating_confidence": 0.30151134457776363, "gs_citation": 26, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12897750390693739026&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, 
"aff_unique_index": "0;0;0;0;1;1;2;0", "aff_unique_norm": "Columbia University;Toyota Research Institute;Stanford University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.columbia.edu;https://www.tri.global;https://www.stanford.edu", "aff_unique_abbr": "Columbia;TRI;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "IsZb0wT3Kw", "title": "ANAVI: Audio Noise Awareness using Visual of Indoor environments for NAVIgation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We propose Audio Noise Awareness using Visuals of Indoors for NAVIgation for quieter robot path planning. While humans are naturally aware of the noise they make and its impact on those around them, robots currently lack this awareness. \nA key challenge in achieving audio awareness for robots is estimating how loud will the robot\u2019s actions be at a listener\u2019s location? Since sound depends upon the geometry and material composition of rooms, we train the robot to passively perceive loudness using visual observations of indoor environments. To this end, we generate data on how loud an `impulse' sounds at different listener locations in simulated homes, and train our Acoustic Noise Predictor (ANP). Next, we collect acoustic profiles corresponding to different actions for navigation. Unifying ANP with action acoustics, we demonstrate experiments with wheeled (Hello Robot Stretch) and legged (Unitree Go2) robots so that these robots adhere to the noise constraints of the environment. All simulated and real-world data, code and model checkpoints is released at https://anavi-corl24.github.io/.", "keywords": "Robots;Acoustic Noise;Vision;Learning", "primary_area": "", "supplementary_material": "/attachment/e7651262222e374585c3632eee7b3d89343801c5.zip", "author": "Vidhi Jain;Rishi Veerapaneni;Yonatan Bisk", "authorids": "~Vidhi_Jain2;~Rishi_Veerapaneni1;~Yonatan_Bisk1", "gender": "F;;M", "homepage": "http://vidhijain.github.io;;http://www.YonatanBisk.com", "dblp": "199/2574;;38/9282", "google_scholar": ";;bWoGh8UAAAAJ", "orcid": ";;0000-0002-2111-9081", "linkedin": "vidhijain96/;;yonatanbisk/", "or_profile": "~Vidhi_Jain2;~Rishi_Veerapaneni1;~Yonatan_Bisk1", "aff": "Google;;Meta", "aff_domain": "google.com;;meta.com", "position": "Student Researcher;;Visiting Professor", "bibtex": "@inproceedings{\njain2024anavi,\ntitle={{ANAVI}: Audio Noise Awareness using Visual of Indoor environments for {NAVI}gation},\nauthor={Vidhi Jain and Rishi Veerapaneni and Yonatan Bisk},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IsZb0wT3Kw}\n}", "github": "https://github.com/vidhiJain/anavi", "project": "", "reviewers": "WfSy;i2im;2dvx;XYtK", "site": "https://openreview.net/forum?id=IsZb0wT3Kw", "pdf_size": 0, "rating": "1;3;3;4", "confidence": "4;3;4;4", "rating_avg": 2.75, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": -0.13245323570650439, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O6MMXBp2c0cJ:scholar.google.com/&scioq=ANAVI:+Audio+Noise+Awareness+using+Visual+of+Indoor+environments+for+NAVIgation&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Google;Meta", "aff_unique_dep": "Google;Meta Platforms, Inc.", "aff_unique_url": "https://www.google.com;https://meta.com", "aff_unique_abbr": "Google;Meta", 
"aff_campus_unique_index": "0", "aff_campus_unique": "Mountain View;", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "Isp19rFFV4", "title": "Multi-Strategy Deployment-Time Learning and Adaptation for Navigation under Uncertainty", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present an approach for performant point-goal navigation in unfamiliar partially-mapped environments. When deployed, our robot runs multiple strategies for deployment-time learning and visual domain adaptation in parallel and quickly selects the best-performing among them. Choosing between policies as they are learned or adapted between navigation trials requires continually updating estimates of their performance as they evolve. Leveraging recent work in model-based learning-informed planning under uncertainty, we determine lower bounds on the would-be performance of newly-updated policies on old trials without needing to re-deploy them. This information constrains and accelerates bandit-like policy selection, affording quick selection of the best-performing strategy shortly after it would start to yield good performance. We validate the effectiveness of our approach in simulated maze-like environments, showing improved navigation cost and cumulative regret versus existing baselines.", "keywords": "policy selection;domain adaptation;navigation under uncertainty", "primary_area": "", "supplementary_material": "/attachment/cc93d8da0b89fd7a5983dd9c914af1412e67221e.zip", "author": "Abhishek Paudel;Xuesu Xiao;Gregory J. Stein", "authorids": "~Abhishek_Paudel1;~Xuesu_Xiao1;~Gregory_J._Stein1", "gender": "M;M;M", "homepage": "https://abpaudel.com/;https://cs.gmu.edu/~xiao/;http://gjstein.com", "dblp": "299/7821;164/8375.html;207/7717", "google_scholar": "ytDltfUAAAAJ;bWbsbjAAAAAJ;", "orcid": ";;0000-0003-1981-4154", "linkedin": "abpaudel;;", "or_profile": "~Abhishek_Paudel1;~Xuesu_Xiao1;~Gregory_J_Stein1", "aff": "George Mason University;George Mason University;George Mason University", "aff_domain": "gmu.edu;gmu.edu;gmu.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\npaudel2024multistrategy,\ntitle={Multi-Strategy Deployment-Time Learning and Adaptation for Navigation under Uncertainty},\nauthor={Abhishek Paudel and Xuesu Xiao and Gregory J. 
Stein},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Isp19rFFV4}\n}", "github": "", "project": "", "reviewers": "APKz;rHoK;UHRD", "site": "https://openreview.net/forum?id=Isp19rFFV4", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7032628180555218538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "George Mason University", "aff_unique_dep": "", "aff_unique_url": "https://www.gmu.edu", "aff_unique_abbr": "GMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "IssXUYvVTg", "title": "MaIL: Improving Imitation Learning with Selective State Space Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "This work introduces Mamba Imitation Learning (MaIL), a novel imitation learning (IL) architecture that offers a computationally efficient alternative to state-of-the-art (SoTA) Transformer policies. Transformer-based policies have achieved remarkable results due to their ability in handling human-recorded data with inherently non-Markovian behavior. However, their high performance comes with the drawback of large models that complicate effective training. While state space models (SSMs) have been known for their efficiency, they were not able to match the performance of Transformers. Mamba significantly improves the performance of SSMs and rivals against Transformers, positioning it as an appealing alternative for IL policies. MaIL leverages Mamba as a backbone and introduces a formalism that allows using Mamba in the encoder-decoder structure. This formalism makes it a versatile architecture that can be used as a standalone policy or as part of a more advanced architecture, such as a diffuser in the diffusion process. 
Extensive evaluations on the LIBERO IL benchmark and three real robot experiments show that MaIL: i) outperforms Transformers in all LIBERO tasks, ii) achieves good performance even with small datasets, iii) is able to effectively process multi-modal sensory inputs, iv) is more robust to input noise compared to Transformers.", "keywords": "Imitation Learning;Sequence Models;Denoising Diffusion Policies", "primary_area": "", "supplementary_material": "/attachment/dcd2dfb8b2d78a28e5f0978a3d7dc1ec03c89a83.zip", "author": "Xiaogang Jia;Qian Wang;Atalay Donat;Bowen Xing;Ge Li;Hongyi Zhou;Onur Celik;Denis Blessing;Rudolf Lioutikov;Gerhard Neumann", "authorids": "~Xiaogang_Jia1;~Qian_Wang34;~Atalay_Donat1;~Bowen_Xing2;~Ge_Li3;~Hongyi_Zhou1;~Onur_Celik1;~Denis_Blessing1;~Rudolf_Lioutikov1;~Gerhard_Neumann2", "gender": "M;M;M;M;M;M;M;M;M;M", "homepage": "https://xiaogangjia.github.io/Personal_Website/;;https://www.ias.informatik.tu-darmstadt.de/Team/AtalayDonat;;;https://hongyizhoucn.github.io/;https://alr.anthropomatik.kit.edu/21_69.php;;https://rudolf.intuitive-robots.net;https://alr.anthropomatik.kit.edu/", "dblp": "23/10777;;;;;;243/5913;219/1435;151/9451;60/4878", "google_scholar": "E7Tja9gAAAAJ;;LG_x9Y8AAAAJ;;;W35-J2sAAAAJ;9jqaTcAAAAAJ;https://scholar.google.de/citations?view_op=list_works;hvjV43MAAAAJ;https://scholar.google.com.tw/citations?user=GL360kMAAAAJ", "orcid": ";;;;;;;;;", "linkedin": ";qian-wang-265810249/;;bowen-xing-397690294?trk=contact-info;geli-bruce/;hongyi-zhou-9413b9242/;;;rudolf-lioutikov-74830730a/;", "or_profile": "~Xiaogang_Jia1;~Qian_Wang34;~Atalay_Donat1;~Bowen_Xing2;~Ge_Li3;~Hongyi_Zhou1;~Onur_Celik1;~Denis_Blessing1;~Rudolf_Lioutikov1;~Gerhard_Neumann1", "aff": "Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruhe Institute of Technology;Karlsruher Institut f\u00fcr Technologie;Karlsruhe Institute of Technology;Karlsruher Institut f\u00fcr Technologie;Karlsruher Institut f\u00fcr Technologie;Karlsruhe Institute of Technology", "aff_domain": "kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;kit.edu;kit.edu", "position": "PhD student;MS student;MS student;MS student;PhD student;PhD student;PhD student;PhD student;Tenure-Track Professor;Full Professor", "bibtex": "@inproceedings{\njia2024mail,\ntitle={Ma{IL}: Improving Imitation Learning with Selective State Space Models},\nauthor={Xiaogang Jia and Qian Wang and Atalay Donat and Bowen Xing and Ge Li and Hongyi Zhou and Onur Celik and Denis Blessing and Rudolf Lioutikov and Gerhard Neumann},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=IssXUYvVTg}\n}", "github": "https://github.com/ALRhub/MaIL", "project": "", "reviewers": "k2cG;Vk9K;mQZR;u9Yp", "site": "https://openreview.net/forum?id=IssXUYvVTg", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "3;3;5;3", "rating_avg": 2.75, "confidence_avg": 3.5, "replies_avg": 6, "authors#_avg": 10, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1308103634215294942&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0;1;0;1;0;0;1", "aff_unique_norm": "Karlsruher Institut f\u00fcr Technologie;Karlsruhe Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.kit.edu;https://www.kit.edu", "aff_unique_abbr": "KIT;KIT", "aff_campus_unique_index": "", 
"aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "id": "JScswMfEQ0", "title": "Mobility VLA: Multimodal Instruction Navigation with Long-Context VLMs and Topological Graphs", "track": "main", "status": "Poster", "tldr": "", "abstract": "An elusive goal in navigation research is to build an intelligent agent that can understand multimodal instructions including natural language and image, and perform useful navigation. To achieve this, we study a widely useful category of navigation tasks we call Multimodal Instruction Navigation with demonstration Tours (MINT), in which the environment prior is provided through a previously recorded demonstration video. Recent advances in Vision Language Models (VLMs) have shown a promising path in achieving this goal as it demonstrates capabilities in perceiving and reasoning about multimodal inputs. However, VLMs are typically trained to predict textual output and it is an open research question about how to best utilize them in navigation. To solve MINT, we present Mobility VLA, a hierarchical Vision-Language-Action (VLA) navigation policy that combines the environment understanding and common sense reasoning power of long-context VLMs and a robust low-level navigation policy based on topological graphs. The high-level policy consists of a long-context VLM that takes the demonstration tour video and the multimodal user instruction as input to find the goal frame in the tour video. Next, a low-level policy uses the goal frame and an offline constructed topological graph to generate robot actions at every timestep. We evaluated Mobility VLA in a 836$m^2$ real world environment and show that Mobility VLA has a high end-to-end success rates on previously unsolved multimodal instructions such as ``Where should I return this?'' while holding a plastic bin.", "keywords": "vision-language navigation;multimodal foundation models;long-context reasoning", "primary_area": "", "supplementary_material": "/attachment/248bc3dc823f65b06741954366209c6b479d4632.zip", "author": "Zhuo Xu;Hao-Tien Lewis Chiang;Zipeng Fu;Mithun George Jacob;Tingnan Zhang;Tsang-Wei Edward Lee;Wenhao Yu;Connor Schenck;David Rendleman;Dhruv Shah;Fei Xia;Jasmine Hsu;Jonathan Hoech;Pete Florence;Sean Kirmani;Sumeet Singh;Vikas Sindhwani;Carolina Parada;Chelsea Finn;Peng Xu;Sergey Levine;Jie Tan", "authorids": "~Zhuo_Xu1;~Hao-Tien_Lewis_Chiang1;~Zipeng_Fu1;mithunjacob@google.com;~Tingnan_Zhang1;~Tsang-Wei_Edward_Lee1;~Wenhao_Yu1;~Connor_Schenck2;drendleman@google.com;~Dhruv_Shah1;~Fei_Xia1;~Jasmine_Hsu1;jhoech@google.com;~Pete_Florence1;~Sean_Kirmani1;~Sumeet_Singh3;~Vikas_Sindhwani1;~Carolina_Parada2;~Chelsea_Finn1;~Peng_Xu9;~Sergey_Levine1;~Jie_Tan1", "gender": "M;M;M;;M;M;M;;;M;M;;;;M;M;M;F;F;M;M;M", "homepage": ";https://sites.google.com/view/lewispro/home;https://zipengfu.github.io;;;;https://wenhaoyu.weebly.com/;;;http://cs.berkeley.edu/~shah;;;;http://www.peteflorence.com/;https://kirmani.io/;;http://vikas.sindhwani.org;;https://ai.stanford.edu/~cbfinn/;;https://people.eecs.berkeley.edu/~svlevine/;http://www.jie-tan.net", "dblp": ";;245/1504;;https://dblp.uni-trier.de/pers/hd/z/Zhang:Tingnan;236/6317.html;;;;;;142/8537;;;;;26/4825;;131/1783;;80/7594;81/7419", "google_scholar": ";megAxigAAAAJ;wMcPTbEAAAAJ;;RM2vMNcAAAAJ;;1bF2s2kAAAAJ;;;;pqP5_PgAAAAJ;;;;iyEuK8kAAAAJ;ZGpE5cYAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;vfPE6hgAAAAJ;460NWeQAAAAJ;8R35rCwAAAAJ;neGbgzYAAAAJ", "orcid": 
";;;;;;;;;;0000-0003-4343-1444;;;;;;;;;;;", "linkedin": "zhuo-xu-joe/;hao-tien-lewis-chiang-22598a79/;zipengfu;;;;;;;;;;;;skirmani;;vikassindhwani;carolinaparada;;;;jie-tan/", "or_profile": "~Zhuo_Xu1;~Hao-Tien_Lewis_Chiang1;~Zipeng_Fu1;mithunjacob@google.com;~Tingnan_Zhang1;~Tsang-Wei_Edward_Lee1;~Wenhao_Yu1;~Connor_Schenck2;drendleman@google.com;~Dhruv_Shah1;~Fei_Xia1;~Jasmine_Hsu1;jhoech@google.com;~Pete_Florence1;~Sean_Kirmani1;~Sumeet_Singh3;~Vikas_Sindhwani1;~Carolina_Parada2;~Chelsea_Finn1;~Peng_Xu9;~Sergey_Levine1;~Jie_Tan1", "aff": "Google DeepMind;Google Deepmind;Stanford University;;Google;;Google;;;UC Berkeley;Google;New York University;;Google;Google DeepMind;Google Brain Robotics;Google;Google DeepMind;Google;Google;Google;Google", "aff_domain": "google.com;deepmind.com;stanford.edu;;google.com;;google.com;;;berkeley.edu;google.com;nyu.edu;;google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com;google.com", "position": "Researcher;Researcher;PhD student;;Software Engineer;;Software Engineer;;;PhD student;Researcher;MS student;;Research Scientist;Researcher;Researcher;Senior Staff Research Scientist;Principal Researcher;Research Scientist;Researcher;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nxu2024mobility,\ntitle={Mobility {VLA}: Multimodal Instruction Navigation with Long-Context {VLM}s and Topological Graphs},\nauthor={Zhuo Xu and Hao-Tien Lewis Chiang and Zipeng Fu and Mithun George Jacob and Tingnan Zhang and Tsang-Wei Edward Lee and Wenhao Yu and Connor Schenck and David Rendleman and Dhruv Shah and Fei Xia and Jasmine Hsu and Jonathan Hoech and Pete Florence and Sean Kirmani and Sumeet Singh and Vikas Sindhwani and Carolina Parada and Chelsea Finn and Peng Xu and Sergey Levine and Jie Tan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JScswMfEQ0}\n}", "github": "", "project": "", "reviewers": "395i;y2wr;21ra", "site": "https://openreview.net/forum?id=JScswMfEQ0", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 22, "corr_rating_confidence": 0.0, "gs_citation": 20, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12049447589331951841&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;2;0;0;3;0;4;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Google;DeepMind;Stanford University;University of California, Berkeley;New York University", "aff_unique_dep": "Google DeepMind;DeepMind;;;", "aff_unique_url": "https://deepmind.com;https://deepmind.com;https://www.stanford.edu;https://www.berkeley.edu;https://www.nyu.edu", "aff_unique_abbr": "DeepMind;DeepMind;Stanford;UC Berkeley;NYU", "aff_campus_unique_index": "1;2;2;3;2;2;2;2;2;2;2;2", "aff_campus_unique": ";Stanford;Mountain View;Berkeley", "aff_country_unique_index": "0;0;1;1;1;1;1;1;1;0;1;1;0;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "JZzaRY8m8r", "title": "KOI: Accelerating Online Imitation Learning via Hybrid Key-state Guidance", "track": "main", "status": "Poster", "tldr": "", "abstract": "Online Imitation Learning methods struggle with the gap between extensive online exploration space and limited expert trajectories, which hinder efficient exploration due to inaccurate task-aware reward estimation.\n Inspired by the findings from cognitive neuroscience that task decomposition could facilitate cognitive processing for efficient learning, we hypothesize that 
an agent could estimate precise task-aware imitation rewards for efficient online exploration by decomposing the target task into the objectives of \"what to do\" and the mechanisms of \"how to do\".\n In this work, we introduce the hybrid Key-state guided Online Imitation (KOI) learning approach, which leverages the integration of semantic and motion key states as guidance for task-aware reward estimation.\n Initially, we utilize the visual-language models to segment the expert trajectory into semantic key states, indicating the objectives of \"what to do\". \n Within the intervals between semantic key states, optical flow is employed to capture motion key states to understand the process of \"how to do\".\n By integrating a thorough grasp of both semantic and motion key states, we refine the trajectory-matching reward computation, encouraging task-aware exploration for efficient online imitation learning.\n Our experiment results prove that our method is more sample efficient than previous state-of-the-art approaches in the Meta-World and LIBERO environments. We also conduct real-world robotic manipulation experiments to validate the efficacy of our method, demonstrating the practical applicability of our KOI method.", "keywords": "Online Imitation Learning; Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/338e0703da548da98aea7ccb4944fc140bfec3bd.zip", "author": "Jingxian Lu;Wenke Xia;Dong Wang;Zhigang Wang;Bin Zhao;Di Hu;Xuelong Li", "authorids": "~Jingxian_Lu1;~Wenke_Xia1;~Dong_Wang1;~Zhigang_Wang3;~Bin_Zhao7;~Di_Hu1;~Xuelong_Li2", "gender": ";M;M;M;M;M;M", "homepage": ";http://www.kaito.org.cn/;https://redwang.github.io/;https://iopen.nwpu.edu.cn/info/1347/2105.htm;https://dtaoo.github.io/;;", "dblp": "69/8235;337/9800;40/3934-28;73/4325-1.html;49/8496-1;l/XuelongLi;35/1989-2", "google_scholar": ";https://scholar.google.co.il/citations?user=v69hlTUAAAAJ;dasL9V4AAAAJ;https://scholar.google.com.hk/citations?user=DQB0hqwAAAAJ;https://scholar.google.com.hk/citations?user=F7bvTOEAAAAJ;ahUibskAAAAJ;cw3EaAYAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Jingxian_Lu1;~Wenke_Xia1;~Dong_Wang1;~Bin_Zhao7;~Di_Hu1;~Xuelong_Li2;~Zhi.gang_Wang1", "aff": "Renmin University of China;Renmin University of China;Shanghai AI Laboratory;Northwest Polytechnical University Xi'an;Renmin University of China;Northwestern Polytechnical University;Shanghai AI Lab", "aff_domain": "ruc.edu.cn;ruc.edu.cn;pjlab.org.cn;nwpu.edu.cn;ruc.edu.cn;nwpu.edu.cn;pjlab.org.cn", "position": "Undergrad student;PhD student;Researcher;Associate Professor;Associate Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nlu2024koi,\ntitle={{KOI}: Accelerating Online Imitation Learning via Hybrid Key-state Guidance},\nauthor={Jingxian Lu and Wenke Xia and Dong Wang and Zhigang Wang and Bin Zhao and Di Hu and Xuelong Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=JZzaRY8m8r}\n}", "github": "https://github.com/GeWu-Lab/Keystate_Online_Imitation", "project": "", "reviewers": "raZY;APBJ;3bAV;5oNU", "site": "https://openreview.net/forum?id=JZzaRY8m8r", "pdf_size": 0, "rating": "3;3;4;4", "confidence": "3;3;4;4", "rating_avg": 3.5, "confidence_avg": 3.5, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 1.0, "gs_citation": 0, "gs_cited_by_link": 
"https://scholar.google.com/scholar?q=related:58SkzN2YIOgJ:scholar.google.com/&scioq=KOI:+Accelerating+Online+Imitation+Learning+via+Hybrid+Key-state+Guidance&hl=en&as_sdt=0,33", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0;3;4", "aff_unique_norm": "Renmin University of China;Shanghai AI Laboratory;Northwest Polytechnical University;Northwestern Polytechnical University;Shanghai AI Lab", "aff_unique_dep": ";;;;", "aff_unique_url": "http://www.ruc.edu.cn;https://www.shanghai-ai-lab.com;http://www.nwpu.edu.cn;https://www.nwpu.edu.cn;https://www.shanghaiailab.com", "aff_unique_abbr": "RUC;SAIL;NWPU;NWPU;SAIL", "aff_campus_unique_index": "1", "aff_campus_unique": ";Xi'an", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "KAzku0Uyh1", "title": "Object-Centric Dexterous Manipulation from Human Motion Data", "track": "main", "status": "Poster", "tldr": "", "abstract": "Manipulating objects to achieve desired goal states is a basic but important skill for dexterous manipulation. Human hand motions demonstrate proficient manipulation capability, providing valuable data for training robots with multi-finger hands. Despite this potential, substantial challenges arise due to the embodiment gap between human and robot hands. In this work, we introduce a hierarchical policy learning framework that uses human hand motion data for training object-centric dexterous robot manipulation. At the core of our method is a high-level trajectory generative model, learned with a large-scale human hand motion capture dataset, to synthesize human-like wrist motions conditioned on the desired object goal states. Guided by the generated wrist motions, deep reinforcement learning is further used to train a low-level finger controller that is grounded in the robot's embodiment to physically interact with the object to achieve the goal. Through extensive evaluation across 10 household objects, our approach not only demonstrates superior performance but also showcases generalization capability to novel object geometries and goal states. Furthermore, we transfer the learned policies from simulation to a real-world bimanual dexterous robot system, further demonstrating its applicability in real-world scenarios. 
Project website: https://cypypccpy.github.io/obj-dex.github.io/.", "keywords": "Dexterous Manipulation;Reinforcement Learning;Learning from Human", "primary_area": "", "supplementary_material": "/attachment/1d0507fff1c1b4678328df57a197e8dafc9304df.zip", "author": "Yuanpei Chen;Chen Wang;Yaodong Yang;Karen Liu", "authorids": "~Yuanpei_Chen2;~Chen_Wang16;~Yaodong_Yang1;~Karen_Liu1", "gender": "M;M;M;", "homepage": "https://cypypccpy.github.io/;http://www.chenwangjeremy.net/;https://www.yangyaodong.com;https://cs.stanford.edu/~karenliu", "dblp": "1234567;;170/1496-1;", "google_scholar": "https://scholar.google.com/citations?hl=en;lStkAzsAAAAJ;https://scholar.google.co.uk/citations?user=6yL0xw8AAAAJ;i28fU0MAAAAJ", "orcid": "0000-0002-0033-492X;;0000-0001-8132-5613;0000-0001-5926-0905", "linkedin": ";;yaodong-yang;", "or_profile": "~Yuanpei_Chen2;~Chen_Wang16;~Yaodong_Yang1;~Karen_Liu1", "aff": "PsiRobot;Computer Science Department, Stanford University;Peking University;Computer Science Department, Stanford University", "aff_domain": "psibot.ai;cs.stanford.edu;pku.edu.cn;cs.stanford.edu", "position": "Researcher;PhD student;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nchen2024objectcentric,\ntitle={Object-Centric Dexterous Manipulation from Human Motion Data},\nauthor={Yuanpei Chen and Chen Wang and Yaodong Yang and Karen Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KAzku0Uyh1}\n}", "github": "", "project": "", "reviewers": "2mos;vhQ2;u9EE", "site": "https://openreview.net/forum?id=KAzku0Uyh1", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14775293435292914523&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "PsiRobot;Stanford University;Peking University", "aff_unique_dep": ";Computer Science Department;", "aff_unique_url": ";https://www.stanford.edu;http://www.pku.edu.cn", "aff_unique_abbr": ";Stanford;Peking U", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "1;2;1", "aff_country_unique": ";United States;China" }, { "id": "KPcX4jetMw", "title": "Reasoning Grasping via Multimodal Large Language Model", "track": "main", "status": "Poster", "tldr": "", "abstract": "Despite significant progress in robotic systems for operation within human-centric environments, existing models still heavily rely on explicit human commands to identify and manipulate specific objects. This limits their effectiveness in environments where understanding and acting on implicit human intentions are crucial. In this study, we introduce a novel task: reasoning grasping, where robots need to generate grasp poses based on indirect verbal instructions or intentions. To accomplish this, we propose an end-to-end reasoning grasping model that integrates a multimodal Large Language Model (LLM) with a vision-based robotic grasping framework. In addition, we present the first reasoning grasping benchmark dataset generated from the GraspNet-1 billion, incorporating implicit instructions for object-level and part-level grasping, and this dataset will soon be available for public access. 
Our results show that directly integrating CLIP or LLaVA with the grasp detection model performs poorly on the challenging reasoning grasping tasks, while our proposed model demonstrates significantly enhanced performance both in the reasoning grasping benchmark and real-world experiments.", "keywords": "Robotics Grasping;Multimodal Large Language Model", "primary_area": "", "supplementary_material": "/attachment/dac505c3a8b0c884d13ada8cc4f6cbec84316a14.zip", "author": "Shiyu Jin;JINXUAN XU;Yutian Lei;Liangjun Zhang", "authorids": "~Shiyu_Jin1;~JINXUAN_XU2;~Yutian_Lei1;~Liangjun_Zhang1", "gender": "M;F;M;M", "homepage": ";https://www.jinxuanxu.com;https://doublelei.me/;https://www.cs.unc.edu/~zlj", "dblp": ";;;", "google_scholar": "GdYgso8AAAAJ;;;Byzk604AAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Shiyu_Jin1;~JINXUAN_XU2;~Yutian_Lei1;~Liangjun_Zhang1", "aff": ";Rutgers University, New Brunswick;;Research, Baidu", "aff_domain": ";rutgers.edu;;research.baidu.com", "position": ";PhD student;;Researcher", "bibtex": "@inproceedings{\njin2024reasoning,\ntitle={Reasoning Grasping via Multimodal Large Language Model},\nauthor={Shiyu Jin and JINXUAN XU and Yutian Lei and Liangjun Zhang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KPcX4jetMw}\n}", "github": "", "project": "", "reviewers": "NKtc;ucUd;YHuz", "site": "https://openreview.net/forum?id=KPcX4jetMw", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.5, "gs_citation": 25, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6946657483134231494&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Rutgers University;Baidu", "aff_unique_dep": ";Research", "aff_unique_url": "https://www.rutgers.edu;https://www.baidu.com", "aff_unique_abbr": "Rutgers;Baidu", "aff_campus_unique_index": "0", "aff_campus_unique": "New Brunswick;", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "id": "KULBk5q24a", "title": "CoViS-Net: A Cooperative Visual Spatial Foundation Model for Multi-Robot Applications", "track": "main", "status": "Poster", "tldr": "", "abstract": "Autonomous robot operation in unstructured environments is often underpinned by spatial understanding through vision. Systems composed of multiple concurrently operating robots additionally require access to frequent, accurate and reliable pose estimates. Classical vision-based methods to regress relative pose are commonly computationally expensive (precluding real-time applications), and often lack data-derived priors for resolving ambiguities. In this work, we propose CoViS-Net, a cooperative, multi-robot visual spatial foundation model that learns spatial priors from data, enabling pose estimation as well as general spatial comprehension. Our model is fully decentralized, platform-agnostic, executable in real-time using onboard compute, and does not require existing networking infrastructure. CoViS-Net provides relative pose estimates and a local bird's-eye-view (BEV) representation, even without camera overlap between robots, and can predict BEV representations of unseen regions. We demonstrate its use in a multi-robot formation control task across various real-world settings. 
We provide supplementary material online and will open source our trained model in due course.\nhttps://sites.google.com/view/covis-net", "keywords": "Multi-Robot Systems;Robot Perception;Foundation Models", "primary_area": "", "supplementary_material": "/attachment/7c08defc300ff8ecaee1f260c82fc05ea9508379.zip", "author": "Jan Blumenkamp;Steven Morad;Jennifer Gielis;Amanda Prorok", "authorids": "~Jan_Blumenkamp1;~Steven_Morad1;jag233@cl.cam.ac.uk;~Amanda_Prorok1", "gender": "M;M;;", "homepage": ";http://www.dangersteve.com/home;;", "dblp": ";247/9311;;", "google_scholar": "GvdaoD4AAAAJ;KvCgriAAAAAJ;;", "orcid": ";0000-0002-8413-2953;;", "linkedin": "janblumenkamp/;;;", "or_profile": "~Jan_Blumenkamp1;~Steven_Morad1;jag233@cl.cam.ac.uk;~Amanda_Prorok1", "aff": "University of Cambridge;University of Cambridge;;", "aff_domain": "cam.ac.uk;cam.ac.uk;;", "position": "PhD student;PhD student;;", "bibtex": "@inproceedings{\nblumenkamp2024covisnet,\ntitle={CoViS-Net: A Cooperative Visual Spatial Foundation Model for Multi-Robot Applications},\nauthor={Jan Blumenkamp and Steven Morad and Jennifer Gielis and Amanda Prorok},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KULBk5q24a}\n}", "github": "https://github.com/proroklab/CoViS-Net", "project": "", "reviewers": "3vko;mxQi;vrDg;bCfq", "site": "https://openreview.net/forum?id=KULBk5q24a", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "5;4;2;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9034084887202234699&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Cambridge", "aff_unique_dep": "", "aff_unique_url": "https://www.cam.ac.uk", "aff_unique_abbr": "Cambridge", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Cambridge", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "KXsropnmNI", "title": "Transferable Tactile Transformers for Representation Learning Across Diverse Sensors and Tasks", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper presents T3: Transferable Tactile Transformers, a framework for tactile representation learning that scales across multi-sensors and multi-tasks.T3 is designed to overcome the contemporary issue that camera-based tactile sensing is extremely heterogeneous, i.e. sensors are built into different form factors, and existing datasets were collected for disparate tasks. T3 captures the shared latent information across different sensor-task pairings by constructing a shared trunk transformer with sensor-specific encoders and task-specific decoders. The pre-training of T3utilizes a novel Foundation Tactile (FoTa) dataset, which is aggregated from several open-sourced datasets and it contains over 3 million data points gathered from 13 sensors and 11 tasks. FoTa is the largest and most diverse dataset in tactile sensing to date and it is made publicly available in a unified format. Across various sensors and tasks, experiments show that T3 pre-trained with FoTa achieved zero-shot transferability in certain sensor-task pairings, can be further fine-tuned with small amounts of domain-specific data, and its performance scales with bigger network sizes. T3 is also effective as a tactile encoder for long horizon contact-rich manipulation. 
Results from sub-millimeter multi-pin electronics insertion tasks show that T3 achieved a task success rate 25% higher than that of policies trained with tactile encoders trained from scratch, or 53% higher than without tactile sensing. Data, code, and model checkpoints are open-sourced at https://t3.alanz.info.", "keywords": "Tactile Sensing;Representation Learning;Heterogeneous Learning;Robot Manipulation;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/5f62a557eb107ca7de1751beb7282546a2b7f99c.zip", "author": "Jialiang Zhao;Yuxiang Ma;Lirui Wang;Edward Adelson", "authorids": "~Jialiang_Zhao1;~Yuxiang_Ma1;~Lirui_Wang1;~Edward_Adelson1", "gender": "M;M;M;M", "homepage": "https://alanz.info/;https://yuxiang-ma.github.io/;https://liruiw.github.io/;http://persci.mit.edu/people/adelson", "dblp": "204/1900;;221/9612;", "google_scholar": "LaW7igYAAAAJ;HmPiPZEAAAAJ;EM9YhH0AAAAJ;", "orcid": ";;;0000-0003-2222-6775", "linkedin": "jialiang-zhao/;;;", "or_profile": "~Jialiang_Zhao1;~Yuxiang_Ma1;~Lirui_Wang1;~Edward_Adelson1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu", "position": "PhD student;PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nzhao2024transferable,\ntitle={Transferable Tactile Transformers for Representation Learning Across Diverse Sensors and Tasks},\nauthor={Jialiang Zhao and Yuxiang Ma and Lirui Wang and Edward Adelson},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KXsropnmNI}\n}", "github": "https://github.com/alanzjl/t3", "project": "", "reviewers": "aci4;V6xF;1X3T", "site": "https://openreview.net/forum?id=KXsropnmNI", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7776981543618384609&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "KcW31O0PtL", "title": "Hint-AD: Holistically Aligned Interpretability in End-to-End Autonomous Driving", "track": "main", "status": "Poster", "tldr": "", "abstract": "End-to-end architectures in autonomous driving (AD) face a significant challenge in interpretability, impeding human-AI trust. Human-friendly natural language has been explored for tasks such as driving explanation and 3D captioning. However, previous works primarily focused on the paradigm of declarative interpretability, where the natural language interpretations are not grounded in the intermediate outputs of AD systems, making the interpretations only declarative. In contrast, aligned interpretability establishes a connection between language and the intermediate outputs of AD systems. Here we introduce Hint-AD, an integrated AD-language system that generates language aligned with the holistic perception-prediction-planning outputs of the AD model. 
By incorporating the intermediate outputs and a holistic token mixer sub-network for effective feature adaptation, Hint-AD achieves desirable accuracy, achieving state-of-the-art results in driving language tasks including driving explanation, 3D dense captioning, and command prediction. To facilitate further study on driving explanation task on nuScenes, we also introduce a human-labeled dataset, Nu-X. Codes, dataset, and models are publicly available at https://anonymous.4open.science/r/Hint-AD-1385/.", "keywords": "Interpretability;Language alignment;Autonomous driving", "primary_area": "", "supplementary_material": "/attachment/71f23fe2a4e89342f15378ddd076a1cc61c1705a.zip", "author": "Kairui Ding;Boyuan Chen;Yuchen Su;Huan-ang Gao;Bu Jin;Chonghao Sima;Xiaohui Li;Wuqiang Zhang;Paul Barsch;Hongyang Li;Hao Zhao", "authorids": "~Kairui_Ding1;~Boyuan_Chen5;~Yuchen_Su4;~Huan-ang_Gao1;~Bu_Jin1;~Chonghao_Sima1;~Xiaohui_Li4;~Wuqiang_Zhang1;~Paul_Barsch1;~Hongyang_Li1;~Hao_Zhao1", "gender": "M;M;F;M;;;;M;M;M;M", "homepage": "https://github.com/Robot-K;https://github.com/by-luckk/;https://github.com/SYC2004;https://c7w.tech/about;;;;https://github.com/WQZhang23;;https://datascience.hku.hk/people/hongyang-li/;https://sites.google.com/view/fromandto", "dblp": ";;;339/0975;;317/0445;;;;95/8433-1;08/3737-2.html", "google_scholar": ";;;WvbKfLgAAAAJ;;dgYJ6esAAAAJ;syvfrvgAAAAJ;https://scholar.google.com/citations?view_op=list_works;;https://scholar.google.com.hk/citations?user=Hfrih1EAAAAJ;ygQznUQAAAAJ", "orcid": ";;;;;;;;;0000-0001-9110-5534;", "linkedin": ";;;;;;;;paul-barsch;hongyangli2020/;", "or_profile": "~Kairui_Ding1;~Boyuan_Chen5;~Yuchen_Su4;~Huan-ang_Gao1;~Bu_Jin1;~Chonghao_Sima1;~Xiaohui_Li4;~Wuqiang_Zhang1;~Paul_Barsch1;~Hongyang_Li1;~Hao_Zhao1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;Shanghai AI Lab;Dalian University of Technology;Karlsruher Institut f\u00fcr Technologie;Technische Universit\u00e4t Dresden;Shanghai AI Lab;Peking University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;cs.tsinghua.edu.cn;;pjlab.org.cn;dlut.edu.cn;kit.edu;tu-dresden.de;pjlab.org.cn;pku.edu.cn", "position": "Undergrad student;Undergrad student;Undergrad student;Undergrad student;;Researcher;MS student;MS student;MS student;Researcher;Postdoc", "bibtex": "@inproceedings{\nding2024hintad,\ntitle={Hint-{AD}: Holistically Aligned Interpretability in End-to-End Autonomous Driving},\nauthor={Kairui Ding and Boyuan Chen and Yuchen Su and Huan-ang Gao and Bu Jin and Chonghao Sima and Xiaohui Li and Wuqiang Zhang and Paul Barsch and Hongyang Li and Hao Zhao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KcW31O0PtL}\n}", "github": "", "project": "", "reviewers": "3Cfu;dfFt;kWqV;Kmwf;HNsL;rKXi", "site": "https://openreview.net/forum?id=KcW31O0PtL", "pdf_size": 0, "rating": "3;3;3;3;3;4", "confidence": "4;4;4;3;3;3", "rating_avg": 3.1666666666666665, "confidence_avg": 3.5, "replies_avg": 8, "authors#_avg": 11, "corr_rating_confidence": -0.4472135954999579, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12358155248329955618&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1;2;3;4;1;5", "aff_unique_norm": "Tsinghua University;Shanghai AI Lab;Dalian University of Technology;Karlsruher Institut f\u00fcr Technologie;Technische Universit\u00e4t Dresden;Peking University", "aff_unique_dep": ";;;;;", "aff_unique_url": 
"https://www.tsinghua.edu.cn;https://www.shanghaiailab.com;http://www.dlut.edu.cn/;https://www.kit.edu;https://tu-dresden.de;http://www.pku.edu.cn", "aff_unique_abbr": "THU;SAIL;DUT;KIT;TUD;Peking U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;1;1;0;0", "aff_country_unique": "China;Germany" }, { "id": "KdVLK0Wo5z", "title": "PoliFormer: Scaling On-Policy RL with Transformers Results in Masterful Navigators", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present PoliFormer (Policy Transformer), an RGB-only indoor navigation agent trained end-to-end with reinforcement learning at scale that generalizes to the real-world without adaptation despite being trained purely in simulation. PoliFormer uses a foundational vision transformer encoder with a causal transformer decoder enabling long-term memory and reasoning. It is trained for hundreds of millions of interactions across diverse environments, leveraging parallelized, multi-machine rollouts for efficient training with high throughput. PoliFormer is a masterful navigator, producing state-of-the-art results across two distinct embodiments, the LoCoBot and Stretch RE-1 robots, and four navigation benchmarks. It breaks through the plateaus of previous work, achieving an unprecedented 85.5% success rate in object goal navigation on the CHORES-S benchmark, a 28.5% absolute improvement. PoliFormer can also be trivially extended to a variety of downstream applications such as object tracking, multi-object navigation, and open-vocabulary navigation with no finetuning.", "keywords": "Embodied Navigation;On-Policy RL;Transformer Policy", "primary_area": "", "supplementary_material": "/attachment/6d32fa27b6f91f01a53b225d0cf1274b50c9ea38.zip", "author": "Kuo-Hao Zeng;Zichen Zhang;Kiana Ehsani;Rose Hendrix;Jordi Salvador;Alvaro Herrasti;Ross Girshick;Aniruddha Kembhavi;Luca Weihs", "authorids": "~Kuo-Hao_Zeng3;~Zichen_Zhang2;~Kiana_Ehsani1;~Rose_Hendrix1;~Jordi_Salvador3;~Alvaro_Herrasti1;~Ross_Girshick1;~Aniruddha_Kembhavi1;~Luca_Weihs1", "gender": "M;M;F;F;;;M;M;M", "homepage": "https://kuohaozeng.github.io;https://zcczhang.github.io/;https://ehsanik.github.io/;;;;http://www.rossgirshick.info/;https://anikem.github.io/;https://lucaweihs.github.io/", "dblp": "185/0743;344/3966;198/0910;236/4851;53/5830;178/0393;89/7658;81/7583;203/6449", "google_scholar": "SRWelkkAAAAJ;https://scholar.google.com/citations?view_op=list_works;RScZCLEAAAAJ;TIPqRC0AAAAJ;https://scholar.google.de/citations?user=YuRVs2oAAAAJ;;W8VIEZgAAAAJ;JnUevM0AAAAJ;F_RBceUAAAAJ", "orcid": ";;;;;;;;0000-0002-6846-6718", "linkedin": "%E5%9C%8B%E8%B1%AA-%E6%9B%BE-0165b7b9/?locale=en_US;;kiana-ehsani-1b81b0162/;;;;;;", "or_profile": "~Kuo-Hao_Zeng3;~Zichen_Zhang2;~Kiana_Ehsani1;~Rose_Hendrix1;~Jordi_Salvador3;~Alvaro_Herrasti1;~Ross_Girshick1;~Aniruddha_Kembhavi1;~Luca_Weihs1", "aff": "Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Allen Institute for AI;Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence;Allen Institute for Artificial Intelligence", "aff_domain": "allenai.org;allenai.org;allenai.org;allenai.org;allenai.org;allenai.org;allenai.org;allenai.org;allenai.org", "position": "Research Scientist;Researcher;Researcher;Research Engineer;Research Engineer;Researcher;Researcher;Research Manager;Research 
Scientist", "bibtex": "@inproceedings{\nzeng2024poliformer,\ntitle={PoliFormer: Scaling On-Policy {RL} with Transformers Results in Masterful Navigators},\nauthor={Kuo-Hao Zeng and Zichen Zhang and Kiana Ehsani and Rose Hendrix and Jordi Salvador and Alvaro Herrasti and Ross Girshick and Aniruddha Kembhavi and Luca Weihs},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=KdVLK0Wo5z}\n}", "github": "", "project": "", "reviewers": "sGZ1;jeoX;Ax82;bADd", "site": "https://openreview.net/forum?id=KdVLK0Wo5z", "pdf_size": 0, "rating": "4;4;4;4", "confidence": "3;3;4;5", "rating_avg": 4.0, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3437734109522154516&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1;0;0;0;0", "aff_unique_norm": "Allen Institute for Artificial Intelligence;Allen Institute for AI", "aff_unique_dep": ";", "aff_unique_url": "https://allenai.org;https://allenai.org", "aff_unique_abbr": "AI2;AI2", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Ke5xrnBFAR", "title": "Gameplay Filters: Robust Zero-Shot Safety through Adversarial Imagination", "track": "main", "status": "Poster", "tldr": "", "abstract": "Despite the impressive recent advances in learning-based robot control, ensuring robustness to out-of-distribution conditions remains an open challenge. Safety filters can, in principle, keep arbitrary control policies from incurring catastrophic failures by overriding unsafe actions, but existing solutions for complex (e.g., legged) robot dynamics do not span the full motion envelope and instead rely on local, reduced-order models. These filters tend to overly restrict agility and can still fail when perturbed away from nominal conditions. This paper presents the gameplay filter, a new class of predictive safety filter that continually plays out hypothetical matches between its simulation-trained safety strategy and a virtual adversary co-trained to invoke worst-case events and sim-to-real error, and precludes actions that would cause failures down the line. We demonstrate the scalability and robustness of the approach with a first-of-its-kind full-order safety filter for (36-D) quadrupedal dynamics. Physical experiments on two different quadruped platforms demonstrate the superior zero-shot effectiveness of the gameplay filter under large perturbations such as tugging and unmodeled terrain. 
Experiment videos and open-source software are available online: https://saferobotics.org/research/gameplay-filter", "keywords": "Robust Safety;Adversarial Reinforcement Learning;Game Theory", "primary_area": "", "supplementary_material": "/attachment/ec325a9bef6062658a6a63439d68e95c6736c58a.zip", "author": "Duy Phuong Nguyen;Kai-Chieh Hsu;Wenhao Yu;Jie Tan;Jaime Fern\u00e1ndez Fisac", "authorids": "~Duy_Phuong_Nguyen2;~Kai-Chieh_Hsu1;~Wenhao_Yu1;~Jie_Tan1;~Jaime_Fern\u00e1ndez_Fisac1", "gender": "M;M;M;M;M", "homepage": ";https://kaichiehhsu.github.io/;https://wenhaoyu.weebly.com/;http://www.jie-tan.net;https://ee.princeton.edu/people/jaime-fernandez-fisac", "dblp": ";;;81/7419;156/0109", "google_scholar": "BpR72EcAAAAJ;;1bF2s2kAAAAJ;neGbgzYAAAAJ;iAq_9tEAAAAJ", "orcid": ";;;;0000-0002-2676-5090", "linkedin": "buzinguyen/;;;jie-tan/;jaime-fisac-134341b0/", "or_profile": "~Duy_Phuong_Nguyen2;~Kai-Chieh_Hsu1;~Wenhao_Yu1;~Jie_Tan1;~Jaime_Fern\u00e1ndez_Fisac1", "aff": "Princeton University;Princeton University;Google;Google;Princeton University", "aff_domain": "princeton.edu;princeton.edu;google.com;google.com;princeton.edu", "position": "PhD student;PhD student;Software Engineer;Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\nnguyen2024gameplay,\ntitle={Gameplay Filters: Robust Zero-Shot Safety through Adversarial Imagination},\nauthor={Duy Phuong Nguyen and Kai-Chieh Hsu and Wenhao Yu and Jie Tan and Jaime Fern{\\'a}ndez Fisac},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Ke5xrnBFAR}\n}", "github": "", "project": "", "reviewers": "ScUX;ZswE;tETa", "site": "https://openreview.net/forum?id=Ke5xrnBFAR", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16212906071008206833&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;1;1;0", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.princeton.edu;https://www.google.com", "aff_unique_abbr": "Princeton;Google", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "L4p6zTlj6k", "title": "TidyBot++: An Open-Source Holonomic Mobile Manipulator for Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Exploiting the promise of recent advances in imitation learning for mobile manipulation will require the collection of large numbers of human-guided demonstrations. This paper proposes an open-source design for an inexpensive, robust, and flexible mobile manipulator that can support arbitrary arms, enabling a wide range of real-world household mobile manipulation tasks. Crucially, our design uses powered casters to enable the mobile base to be fully holonomic, able to control all planar degrees of freedom independently and simultaneously. This feature makes the base more maneuverable and simplifies many mobile manipulation tasks, eliminating the kinematic constraints that create complex and time-consuming motions in nonholonomic bases. We equip our robot with an intuitive mobile phone teleoperation interface to enable easy data acquisition for imitation learning. 
In our experiments, we use this interface to collect data and show that the resulting learned policies can successfully perform a variety of common household mobile manipulation tasks.", "keywords": "mobile manipulation;imitation learning;holonomic drive", "primary_area": "", "supplementary_material": "/attachment/b38f10c809a116b09ddd9b9d62945e75f061ade4.zip", "author": "Jimmy Wu;William Chong;Robert Holmberg;Aaditya Prasad;Yihuai Gao;Oussama Khatib;Shuran Song;Szymon Rusinkiewicz;Jeannette Bohg", "authorids": "~Jimmy_Wu1;wmchong@stanford.edu;holmbergbob@gmail.com;~Aaditya_Prasad2;~Yihuai_Gao1;~Oussama_Khatib1;~Shuran_Song3;~Szymon_Rusinkiewicz2;~Jeannette_Bohg1", "gender": "M;;;M;M;M;F;;", "homepage": "http://jimmyyhwu.github.io;;;;;http://cs.stanford.edu/group/manips/people/oussama-khatib/;https://shurans.github.io/;https://www.cs.princeton.edu/~smr/;https://web.stanford.edu/~bohg/", "dblp": "00/8739;;;;;;;61/5465.html;52/7377", "google_scholar": "UoQdAc4AAAAJ;;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=4arkOLcAAAAJ;https://scholar.google.com/citations?hl=en;RaScARwAAAAJ;rjnJnEkAAAAJ", "orcid": ";;;;;;;0000-0002-4253-2588;0000-0002-4921-7193", "linkedin": ";;;aaditya-prasad/;yihuai-gao-167711245/;;;;", "or_profile": "~Jimmy_Wu1;wmchong@stanford.edu;holmbergbob@gmail.com;~Aaditya_Prasad2;~Yihuai_Gao1;~Oussama_Khatib1;~Shuran_Song3;~Szymon_Rusinkiewicz2;~Jeannette_Bohg1", "aff": "Princeton University;;;Stanford University;Stanford University;Stanford University;Stanford University;Princeton University;Stanford University", "aff_domain": "princeton.edu;;;stanford.edu;stanford.edu;;stanford.edu;princeton.edu;stanford.edu", "position": "PhD student;;;Undergrad student;PhD student;Full Professor;Assistant Professor;Professor;Assistant Professor", "bibtex": "@inproceedings{\nwu2024tidybot,\ntitle={TidyBot++: An Open-Source Holonomic Mobile Manipulator for Robot Learning},\nauthor={Jimmy Wu and William Chong and Robert Holmberg and Aaditya Prasad and Yihuai Gao and Oussama Khatib and Shuran Song and Szymon Rusinkiewicz and Jeannette Bohg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=L4p6zTlj6k}\n}", "github": "https://github.com/jimmyyhwu/tidybot2", "project": "", "reviewers": "j6ug;NmwR;rKYA", "site": "https://openreview.net/forum?id=L4p6zTlj6k", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 4, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10664432624962332847&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 8, "aff_unique_index": "0;1;1;1;1;0;1", "aff_unique_norm": "Princeton University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.princeton.edu;https://www.stanford.edu", "aff_unique_abbr": "Princeton;Stanford", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "LZh48DTg71", "title": "Evaluating Real-World Robot Manipulation Policies in Simulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "The field of robotics has made significant advances towards generalist robot manipulation policies. 
However, real-world evaluation of such policies is not scalable and faces reproducibility challenges, issues that are likely to worsen as policies broaden the spectrum of tasks they can perform. In this work, we demonstrate that simulation-based evaluation can be a scalable, reproducible, and reliable proxy for real-world evaluation. We identify control and visual disparities between real and simulated environments as key challenges for reliable simulated evaluation and propose approaches for mitigating these gaps without needing to painstakingly craft full-fidelity digital twins. We then employ these techniques to create SIMPLER, a collection of simulated environments for policy evaluation on common real robot manipulation setups. Through over 1500 paired sim-and-real evaluations of manipulation policies across two embodiments and eight task families, we demonstrate strong correlation between policy performance in SIMPLER environments and that in the real world. Beyond aggregated trends, we find that SIMPLER evaluations effectively reflect the real-world behaviors of individual policies, such as sensitivity to various distribution shifts. We are committed to open-sourcing all SIMPLER environments along with our workflow for creating new environments to facilitate research on general-purpose manipulation policies and simulated evaluation frameworks. Website: https://simpler-env.github.io/", "keywords": "real-to-sim;policy evaluation;robot manipulation", "primary_area": "", "supplementary_material": "/attachment/dd8cab45468d62e45732a3dac4967bb38219cf3d.zip", "author": "Xuanlin Li;Kyle Hsu;Jiayuan Gu;Oier Mees;Karl Pertsch;Homer Rich Walke;Chuyuan Fu;Ishikaa Lunawat;Isabel Sieh;Sean Kirmani;Sergey Levine;Jiajun Wu;Chelsea Finn;Hao Su;Quan Vuong;Ted Xiao", "authorids": "~Xuanlin_Li1;~Kyle_Hsu1;~Jiayuan_Gu1;~Oier_Mees1;~Karl_Pertsch1;~Homer_Rich_Walke1;~Chuyuan_Fu1;~Ishikaa_Lunawat1;~Isabel_Sieh1;~Sean_Kirmani1;~Sergey_Levine1;~Jiajun_Wu1;~Chelsea_Finn1;~Hao_Su1;~Quan_Vuong2;~Ted_Xiao1", "gender": ";M;;M;;M;F;F;F;M;M;M;F;M;M;M", "homepage": "https://xuanlinli17.github.io/;https://www.kylehsu.org;https://jiayuan-gu.github.io/;https://www.oiermees.com/;https://kpertsch.github.io/;https://homerwalke.com;;;;https://kirmani.io/;https://people.eecs.berkeley.edu/~svlevine/;https://jiajunwu.com;https://ai.stanford.edu/~cbfinn/;http://ai.ucsd.edu/~haosu;https://quanvuong.github.io;https://www.tedxiao.me", "dblp": "251/3029;217/3841;210/2429;190/8659;211/7137;279/6795;;;;;80/7594;117/4768;131/1783;09/4945-1;;198/0598", "google_scholar": "7vyVxxQAAAAJ;KCdL5B0AAAAJ;YH1v2uYAAAAJ;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;https://scholar.google.com/citations?view_op=list_works;ZWH5jCwAAAAJ;bDq7MZMAAAAJ;;;iyEuK8kAAAAJ;8R35rCwAAAAJ;2efgcS0AAAAJ;vfPE6hgAAAAJ;1P8Zu04AAAAJ;NSWI3OwAAAAJ;", "orcid": ";;0000-0002-3207-7921;;;;;;;;;0000-0002-4176-343X;;;;", "linkedin": "xuanlin-li-4684b8145/;;;oier-mees-a3069488;;;;ishikaalunawat/;isabelsieh/;skirmani;;jiajunwu/;;;;", "or_profile": "~Xuanlin_Li1;~Kyle_Hsu1;~Jiayuan_Gu1;~Oier_Mees1;~Karl_Pertsch1;~Homer_Rich_Walke1;~Chuyuan_Fu1;~Ishikaa_Lunawat1;~Isabel_Sieh1;~Sean_Kirmani1;~Sergey_Levine1;~Jiajun_Wu1;~Chelsea_Finn1;~Hao_Su1;~Quan_Vuong2;~Ted_Xiao1", "aff": "University of California, San Diego;Stanford University;University of California, San Diego;Electrical Engineering & Computer Science Department, University of California, Berkeley;Stanford University;University of California, Berkeley;Google;;Stanford University;Google DeepMind;Google;Stanford 
University;Google;University of California, San Diego;physical intelligence;", "aff_domain": "ucsd.edu;cs.stanford.edu;ucsd.edu;eecs.berkeley.edu;stanford.edu;berkeley.edu;google.com;;stanford.edu;google.com;google.com;stanford.edu;google.com;ucsd.edu;physicalintelligence.company;", "position": "PhD student;PhD student;PhD student;Postdoc;Postdoc;PhD student;software engineer;;Undergrad student;Researcher;Research Scientist;Assistant Professor;Research Scientist;Associate Professor;Researcher;", "bibtex": "@inproceedings{\nli2024evaluating,\ntitle={Evaluating Real-World Robot Manipulation Policies in Simulation},\nauthor={Xuanlin Li and Kyle Hsu and Jiayuan Gu and Oier Mees and Karl Pertsch and Homer Rich Walke and Chuyuan Fu and Ishikaa Lunawat and Isabel Sieh and Sean Kirmani and Sergey Levine and Jiajun Wu and Chelsea Finn and Hao Su and Quan Vuong and Ted Xiao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LZh48DTg71}\n}", "github": "https://github.com/simpler-env/SimplerEnv", "project": "", "reviewers": "mzQD;SfVk;VgL2", "site": "https://openreview.net/forum?id=LZh48DTg71", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 16, "corr_rating_confidence": 0.0, "gs_citation": 67, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2706763605735290914&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;1;2;3;1;3;3;1;3;0;4", "aff_unique_norm": "University of California, San Diego;Stanford University;University of California, Berkeley;Google;Physical Intelligence", "aff_unique_dep": ";;Electrical Engineering & Computer Science Department;Google;", "aff_unique_url": "https://www.ucsd.edu;https://www.stanford.edu;https://www.berkeley.edu;https://www.google.com;", "aff_unique_abbr": "UCSD;Stanford;UC Berkeley;Google;", "aff_campus_unique_index": "0;1;0;2;1;2;3;1;3;1;3;0", "aff_campus_unique": "San Diego;Stanford;Berkeley;Mountain View;", "aff_country_unique_index": "0;0;0;0;0;0;0;0;1;0;0;0;0", "aff_country_unique": "United States;United Kingdom;" }, { "id": "LiwdXkMsDv", "title": "Uncertainty-Aware Decision Transformer for Stochastic Driving Environments", "track": "main", "status": "Poster", "tldr": "", "abstract": "Offline Reinforcement Learning (RL) enables policy learning without active interactions, making it especially appealing for self-driving tasks. Recent successes of Transformers inspire casting offline RL as sequence modeling, which, however, fails in stochastic environments with incorrect assumptions that identical actions can consistently achieve the same goal. In this paper, we introduce an UNcertainty-awaRE deciSion Transformer (UNREST) for planning in stochastic driving environments without introducing additional transition or complex generative models. Specifically, UNREST estimates uncertainties by conditional mutual information between transitions and returns. Discovering 'uncertainty accumulation' and 'temporal locality' properties of driving environments, we replace the global returns in decision transformers with truncated returns less affected by environments to learn from actual outcomes of actions rather than environment transitions. We also dynamically evaluate uncertainty at inference for cautious planning. 
Extensive experiments demonstrate UNREST's superior performance in various driving scenarios and the power of our uncertainty estimation strategy.", "keywords": "Self-Driving;Decision Transformer;Uncertainty-Aware Planning", "primary_area": "", "supplementary_material": "/attachment/f6cf28b1e4448ce389d2afba5dad53cf28a6f38a.zip", "author": "Zenan Li;Fan Nie;Qiao Sun;Fang Da;Hang Zhao", "authorids": "~Zenan_Li4;~Fan_Nie1;~Qiao_Sun1;~Fang_Da2;~Hang_Zhao1", "gender": "M;;M;Not Specified;M", "homepage": "https://github.com/Emiyalzn;;https://qiaosun.me;;http://www.mit.edu/~hangzhao/", "dblp": ";;10/6242;131/6851;", "google_scholar": ";;D1KNQasAAAAJ;https://scholar.google.com/citations?hl=en;DmahiOYAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Zenan_Li4;~Fan_Nie1;~Qiao_Sun1;~Fang_Da2;~Hang_Zhao1", "aff": ";;;QCraft Inc;Tsinghua University", "aff_domain": ";;;qcraft.ai;tsinghua.edu.cn", "position": ";;;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nli2024uncertaintyaware,\ntitle={Uncertainty-Aware Decision Transformer for Stochastic Driving Environments},\nauthor={Zenan Li and Fan Nie and Qiao Sun and Fang Da and Hang Zhao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LiwdXkMsDv}\n}", "github": "https://github.com/Emiyalzn/CoRL24-UNREST", "project": "", "reviewers": "Jtpp;hnBQ;1GUa", "site": "https://openreview.net/forum?id=LiwdXkMsDv", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9845160657300893873&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1", "aff_unique_norm": "QCraft Inc;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": ";https://www.tsinghua.edu.cn", "aff_unique_abbr": ";THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "United States;China" }, { "id": "Lixj7WEGEy", "title": "MBC: Multi-Brain Collaborative Control for Quadruped Robots", "track": "main", "status": "Poster", "tldr": "", "abstract": "In the field of locomotion task of quadruped robots, Blind Policy and Perceptive Policy each have their own advantages and limitations. The Blind Policy relies on preset sensor information and algorithms, suitable for known and structured environments, but it lacks adaptability in complex or unknown environments. The Perceptive Policy uses visual sensors to obtain detailed environmental information, allowing it to adapt to complex terrains, but its effectiveness is limited under occluded conditions, especially when perception fails. Unlike the Blind Policy, the Perceptive Policy is not as robust under these conditions. To address these challenges, we propose a MBC:Multi-Brain collaborative system that incorporates the concepts of Multi-Agent Reinforcement Learning and introduces collaboration between the Blind Policy and the Perceptive Policy. By applying this multi-policy collaborative model to a quadruped robot, the robot can maintain stable locomotion even when the perceptual system is impaired or observational data is incomplete. 
Our simulations and real-world experiments demonstrate that this system significantly improves the robot's passability and robustness against perception failures in complex environments, validating the effectiveness of multi-policy collaboration in enhancing robotic motion performance.", "keywords": "Quadruped Robots;Perception Fails;Multi-Brain Collaborative", "primary_area": "", "supplementary_material": "/attachment/ff4af201a5def7f6c89f683192983da335a87638.zip", "author": "Hang Liu;Yi Cheng;Rankun Li;Xiaowen Hu;Linqi Ye;Houde Liu", "authorids": "~Hang_Liu9;~Yi_Cheng7;~Rankun_Li1;~Xiaowen_Hu1;~Linqi_Ye1;~Houde_Liu1", "gender": "M;M;M;M;F;M", "homepage": "https://66lau.github.io/;https://chengeeee.github.io/;;https://linqi-ye.github.io/;https://www.sigs.tsinghua.edu.cn/lhd/main.htm;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": ";;0009-0000-9735-5817;;;0009-0005-3361-5778", "linkedin": ";;;;;", "or_profile": "~Hang_Liu9;~Yi_Cheng7;~Rankun_Li1;~Linqi_Ye1;~Houde_Liu1;~Kris_Hu1", "aff": "University of Michigan - Ann Arbor;Tsinghua University;Shanghai University;Shanghai University;Tsinghua University;Shanghai University", "aff_domain": "umich.edu;mails.tsinghua.edu.cn;shu.edu.cn;shu.edu.cn;mails.tsinghua.edu.cn;shu.edu.cn", "position": "MS student;MS student;China;Associate Professor;Full Professor;MS student", "bibtex": "@inproceedings{\nliu2024mbc,\ntitle={{MBC}: Multi-Brain Collaborative Control for Quadruped Robots},\nauthor={Hang Liu and Yi Cheng and Rankun Li and Xiaowen Hu and Linqi Ye and Houde Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Lixj7WEGEy}\n}", "github": "", "project": "", "reviewers": "xrV5;yXBP;mQDL", "site": "https://openreview.net/forum?id=Lixj7WEGEy", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:lONY18iVGskJ:scholar.google.com/&scioq=MBC:+Multi-Brain+Collaborative+Control+for+Quadruped+Robots&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;2;2;1;2", "aff_unique_norm": "University of Michigan;Tsinghua University;Shanghai University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.umich.edu;https://www.tsinghua.edu.cn;https://www.shu.edu.cn", "aff_unique_abbr": "UM;THU;SHU", "aff_campus_unique_index": "0", "aff_campus_unique": "Ann Arbor;", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "United States;China" }, { "id": "LmOF7UAOZ7", "title": "A Planar-Symmetric SO(3) Representation for Learning Grasp Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "Planar-symmetric hands, such as parallel grippers, are widely adopted in both research and industrial fields.\nTheir symmetry, however, introduces ambiguity and discontinuity in the SO(3) representation, which hinders both the training and inference of neural network-based grasp detectors.\nWe propose a novel SO(3) representation that can parametrize a pair of planar-symmetric poses with a single parameter set by leveraging the 2D Bingham distribution.\nWe also detail a grasp detector based on our representation, which provides a more consistent rotation output.\nAn intensive evaluation with multiple grippers and objects in both the simulation and the real world quantitatively shows our approach's contribution.", "keywords": "Grasp Detection;Rotation Representation;Parallel 
Gripper", "primary_area": "", "supplementary_material": "/attachment/0d80385f7583e3ae52cea6e9f849892c96188587.zip", "author": "Tianyi Ko;Takuya Ikeda;Hiroya Sato;Koichi Nishiwaki", "authorids": "~Tianyi_Ko1;~Takuya_Ikeda1;~Hiroya_Sato1;~Koichi_Nishiwaki1", "gender": "M;M;;", "homepage": ";;https://github.com/hiroya1224;", "dblp": ";126/1141;;85/6001", "google_scholar": ";HS4dPGQAAAAJ;;https://scholar.google.co.jp/citations?user=oC2CnhUAAAAJ", "orcid": "0000-0002-2576-9161;;;", "linkedin": ";;;", "or_profile": "~Tianyi_Ko1;~Takuya_Ikeda1;~Hiroya_Sato1;~Koichi_Nishiwaki1", "aff": "Woven by Toyota, Inc.;Woven by Toyota, Inc.;The University of Tokyo, Tokyo University;Woven by Toyota", "aff_domain": "woven.toyota;woven.toyota;t.u-tokyo.ac.jp;woven.toyota", "position": "Researcher;Researcher;PhD student;Principal Researcher", "bibtex": "@inproceedings{\nko2024a,\ntitle={A Planar-Symmetric {SO}(3) Representation for Learning Grasp Detection},\nauthor={Tianyi Ko and Takuya Ikeda and Hiroya Sato and Koichi Nishiwaki},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=LmOF7UAOZ7}\n}", "github": "", "project": "", "reviewers": "pqFb;Ueu5;oYED", "site": "https://openreview.net/forum?id=LmOF7UAOZ7", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5083088626923670460&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Toyota, Inc.;University of Tokyo;Toyota", "aff_unique_dep": ";;Woven", "aff_unique_url": "https://www.toyota.com;https://www.u-tokyo.ac.jp;https://www.toyota-global.com", "aff_unique_abbr": "Toyota;UTokyo;Toyota", "aff_campus_unique_index": "1", "aff_campus_unique": ";Tokyo", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "United States;Japan" }, { "id": "M0Gv07MUMU", "title": "Tokenize the World into Object-level Knowledge to Address Long-tail Events in Autonomous Driving", "track": "main", "status": "Poster", "tldr": "", "abstract": "The autonomous driving industry is increasingly adopting end-to-end learning from sensory inputs to minimize human biases in system design. Traditional end-to-end driving models, however, suffer from long-tail events due to rare or unseen inputs within their training distributions. To address this, we propose TOKEN, a novel Multi-Modal Large Language Model (MM-LLM) that tokenizes the world into object-level knowledge, enabling better utilization of LLM\u2019s reasoning capabilities to enhance autonomous vehicle planning in long-tail scenarios. TOKEN effectively alleviates data scarcity and inefficient tokenization by producing condensed and semantically enriched representations of the scene. Our results demonstrate that TOKEN excels in grounding, reasoning, and planning capabilities, outperforming existing frameworks with a 27% reduction in trajectory L2 error and a 39% decrease in collision rates in long-tail scenarios. 
Additionally, our work highlights the importance of representation alignment and structured reasoning in sparking the common-sense reasoning capabilities of MM-LLMs for effective planning.", "keywords": "Multi-modal LLM;Autonomous Driving;Representation Alignment", "primary_area": "", "supplementary_material": "/attachment/cfcdc500195242eb15298d1969710be605fc48c5.zip", "author": "Thomas Tian;Boyi Li;Xinshuo Weng;Yuxiao Chen;Edward Schmerling;Yue Wang;Boris Ivanovic;Marco Pavone", "authorids": "~Thomas_Tian1;~Boyi_Li1;~Xinshuo_Weng3;~Yuxiao_Chen3;~Edward_Schmerling1;~Yue_Wang2;~Boris_Ivanovic1;~Marco_Pavone1", "gender": "M;F;F;M;M;;M;M", "homepage": "https://scholar.google.com/citations?user=uY4D8-wAAAAJ&hl=en&authuser=1;https://sites.google.com/site/boyilics/home;http://www.xinshuoweng.com;;https://yuewang.xyz;http://www.borisivanovic.com/;https://web.stanford.edu/~pavone/;https://research.nvidia.com/person/yuxiao-chen", "dblp": ";;192/1952.html;143/7326;33/4822-41;203/8356;91/3382-1.html;158/4934-1", "google_scholar": ";;dthSEsoAAAAJ;b4Kj6MIAAAAJ;v-AEFIEAAAAJ;ey9AQcEAAAAJ;RhOpyXcAAAAJ;AOdxmJYAAAAJ", "orcid": ";;0000-0002-7894-4381;;;0000-0002-8698-202X;;0000-0001-5276-7156", "linkedin": ";;xinshuoweng;;;boris-ivanovic-a3103064;;", "or_profile": "~Thomas_Tian1;~Boyi_Li1;~Xinshuo_Weng3;~Edward_Schmerling1;~Yue_Wang2;~Boris_Ivanovic1;~Marco_Pavone1;~Yuxiao_Chen2", "aff": "University of California, Berkeley;University of California, Berkeley;NVIDIA;NVIDIA;NVIDIA;NVIDIA;Stanford University;California Institute of Technology", "aff_domain": "berkeley.edu;berkeley.edu;nvidia.com;nvidia.com;nvidia.com;nvidia.com;stanford.edu;caltech.edu", "position": "PhD student;Postdoc;Researcher;Researcher;Researcher;Researcher;Associate Professor;Postdoc", "bibtex": "@inproceedings{\ntian2024tokenize,\ntitle={Tokenize the World into Object-level Knowledge to Address Long-tail Events in Autonomous Driving},\nauthor={Thomas Tian and Boyi Li and Xinshuo Weng and Yuxiao Chen and Edward Schmerling and Yue Wang and Boris Ivanovic and Marco Pavone},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M0Gv07MUMU}\n}", "github": "", "project": "", "reviewers": "vv3a;gVTX;Ybio", "site": "https://openreview.net/forum?id=M0Gv07MUMU", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2648954572581006015&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;1;1;1;2;3", "aff_unique_norm": "University of California, Berkeley;NVIDIA;Stanford University;California Institute of Technology", "aff_unique_dep": ";NVIDIA Corporation;;", "aff_unique_url": "https://www.berkeley.edu;https://www.nvidia.com;https://www.stanford.edu;https://www.caltech.edu", "aff_unique_abbr": "UC Berkeley;NVIDIA;Stanford;Caltech", "aff_campus_unique_index": "0;0;2;3", "aff_campus_unique": "Berkeley;;Stanford;Pasadena", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "M0JtsLuhEE", "title": "T$^2$SQNet: A Recognition Model for Manipulating Partially Observed Transparent Tableware Objects", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recognizing and manipulating transparent tableware from partial view RGB image observations is made challenging by the difficulty in obtaining 
reliable depth measurements of transparent objects. In this paper we present the Transparent Tableware SuperQuadric Network (T$^2$SQNet), a neural network model that leverages a family of newly extended deformable superquadrics to produce low-dimensional, instance-wise and accurate 3D geometric representations of transparent objects from partial views. As a byproduct and contribution of independent interest, we also present TablewareNet, a publicly available toolset of seven parametrized shapes based on our extended deformable superquadrics, that can be used to generate new datasets of tableware objects of diverse shapes and sizes. Experiments with T$^2$SQNet trained with TablewareNet show that T$^2$SQNet outperforms existing methods in recognizing transparent objects, in some cases by significant margins, and can be effectively used in robotic applications like decluttering and target retrieval.", "keywords": "Transparent objects;Shape recognition;Object manipulation", "primary_area": "", "supplementary_material": "/attachment/e80a99dfd39e2c594f0a9f14d6306b41d996af0c.zip", "author": "Young Hun Kim;Seungyeon Kim;Yonghyeon Lee;Frank C. Park", "authorids": "~Young_Hun_Kim1;~Seungyeon_Kim2;~Yonghyeon_Lee2;~Frank_C._Park1", "gender": "M;M;M;M", "homepage": "https://github.com/yhun96;https://seungyeon-k.github.io/;https://www.gabe-yhlee.com;http://robotics.snu.ac.kr", "dblp": ";74/7997-3;182/6796;p/FrankChongwooPark", "google_scholar": ";https://scholar.google.com/citations?hl=en;;u-h3PJIAAAAJ", "orcid": ";0000-0001-6708-5684;;0000-0002-0293-6975", "linkedin": ";seungyeon-kim-45a20b263/;;", "or_profile": "~Young_Hun_Kim1;~Seungyeon_Kim2;~Yonghyeon_Lee2;~Frank_C._Park1", "aff": "Seoul National University;Seoul National University;Korea Institute for Advanced Study;Seoul National University", "aff_domain": "snu.ac.kr;snu.ac.kr;kias.re.kr;snu.ac.kr", "position": "PhD student;PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nkim2024tsqnet,\ntitle={T\\${\\textasciicircum}2\\${SQN}et: A Recognition Model for Manipulating Partially Observed Transparent Tableware Objects},\nauthor={Young Hun Kim and Seungyeon Kim and Yonghyeon Lee and Frank C. 
Park},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=M0JtsLuhEE}\n}", "github": "https://github.com/seungyeon-k/T2SQNet-public", "project": "", "reviewers": "3VBo;hYHE;p6K4", "site": "https://openreview.net/forum?id=M0JtsLuhEE", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:kzbbnf9N_LMJ:scholar.google.com/&scioq=T%24%5E2%24SQNet:+A+Recognition+Model+for+Manipulating+Partially+Observed+Transparent+Tableware+Objects&hl=en&as_sdt=0,5", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Seoul National University;Korea Institute for Advanced Study", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;http://www.kaist.edu", "aff_unique_abbr": "SNU;KIAS", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "MfIUKzihC8", "title": "CtRL-Sim: Reactive and Controllable Driving Agents with Offline Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Evaluating autonomous vehicle stacks (AVs) in simulation typically involves replaying driving logs from real-world recorded traffic. However, agents replayed from offline data are not reactive and hard to intuitively control. Existing approaches address these challenges by proposing methods that rely on heuristics or generative models of real-world data but these approaches either lack realism or necessitate costly iterative sampling procedures to control the generated behaviours. In this work, we take an alternative approach and propose CtRL-Sim, a method that leverages return-conditioned offline reinforcement learning to efficiently generate reactive and controllable traffic agents. Specifically, we process real-world driving data through a physics-enhanced Nocturne simulator to generate a diverse offline reinforcement learning dataset, annotated with various reward terms. We then train a return-conditioned multi-agent behaviour model that allows for fine-grained manipulation of agent behaviours by modifying the desired returns for the various reward components. This capability enables the generation of a wide range of driving behaviours beyond the scope of the initial dataset, including adversarial behaviours. 
We demonstrate that CtRL-Sim can generate diverse and realistic safety-critical scenarios while providing fine-grained control over agent behaviours.", "keywords": "offline reinforcement learning;autonomous driving;simulation", "primary_area": "", "supplementary_material": "/attachment/0df2437b4276a4ab8834406524c180595d1db532.zip", "author": "Luke Rowe;Roger Girgis;Anthony Gosselin;Bruno Carrez;Florian Golemo;Felix Heide;Liam Paull;Christopher Pal", "authorids": "~Luke_Rowe1;~Roger_Girgis1;~Anthony_Gosselin1;~Bruno_Carrez1;~Florian_Golemo1;~Felix_Heide2;~Liam_Paull1;~Christopher_Pal1", "gender": "M;M;M;M;M;;;", "homepage": "https://rluke22.github.io/;;;https://www.linkedin.com/in/bruno-carrez-1705bb4/;https://fgolemo.github.io/;https://www.cs.princeton.edu/~fheide/;;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ&hl=en&oi=ao", "dblp": "330/2836;;;;08/8643;01/9396;;45/1217", "google_scholar": "https://scholar.google.ca/citations?user=J-CXTg8AAAAJ;https://scholar.google.ca/citations?user=mBLay8oAAAAJ;https://scholar.google.ca/citations?hl=en;;https://scholar.google.de/citations?user=qvRf9xsAAAAJ;gRqzSHsAAAAJ;;https://scholar.google.ca/citations?user=1ScWJOoAAAAJ", "orcid": ";;;;0000-0001-9238-7764;;;", "linkedin": "rluke22/;;anthony-gosselin-098b7a1a1/;bruno-carrez-1705bb4/;;;;", "or_profile": "~Luke_Rowe1;~Roger_Girgis1;~Anthony_Gosselin1;~Bruno_Carrez1;~Florian_Golemo1;~Felix_Heide2;~Liam_Paull1;~Christopher_Pal1", "aff": "Universit\u00e9 de Montr\u00e9al;Mila - Quebec Artificial Intelligence Institute;Montreal Institute for Learning Algorithms, University of Montreal, Universit\u00e9 de Montr\u00e9al;Mila;Mila;Algolux;;Polytechnique Montreal", "aff_domain": "umontreal.ca;mila.quebec;mila.umontreal.ca;mila.quebec;mila.quebec;algolux.com;;polymtl.ca", "position": "PhD student;PhD student;MS student;Researcher;Postdoc;CTO;;Full Professor", "bibtex": "@inproceedings{\nrowe2024ctrlsim,\ntitle={Ct{RL}-Sim: Reactive and Controllable Driving Agents with Offline Reinforcement Learning},\nauthor={Luke Rowe and Roger Girgis and Anthony Gosselin and Bruno Carrez and Florian Golemo and Felix Heide and Liam Paull and Christopher Pal},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MfIUKzihC8}\n}", "github": "https://github.com/montrealrobotics/ctrl-sim/", "project": "", "reviewers": "KJao;xpFT;GaAn", "site": "https://openreview.net/forum?id=MfIUKzihC8", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10125900408549860261&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;3;4;5", "aff_unique_norm": "Universit\u00e9 de Montr\u00e9al;Quebec Artificial Intelligence Institute;University of Montreal;Mila;Algolux;Polytechnique Montreal", "aff_unique_dep": ";Artificial Intelligence;Montreal Institute for Learning Algorithms;Quebec Artificial Intelligence Institute;;", "aff_unique_url": "https://www.umontreal.ca;https://mila.quebec;https://www.mila.quebec;https://mila.quebec;https://www.algolux.com;https://www.polymtl.ca", "aff_unique_abbr": "UdeM;Mila;MILA;Mila;;PolyMTL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Montreal", "aff_country_unique_index": "0;0;0;0;0;1;0", "aff_country_unique": "Canada;Sweden" }, { "id": "MfuzopqVOX", "title": "LiDARGrid: Self-supervised 3D Opacity Grid from LiDAR for Scene 
Forecasting", "track": "main", "status": "Poster", "tldr": "", "abstract": "Timely capturing the dense geometry of the surrounding scene with unlabeled LiDAR data is valuable but under-explored for mobile robotic applications. Its value lies in the huge amount of such unlabeled data, enabling self-supervised learning for various downstream tasks. Current dynamic 3D scene reconstruction approaches however heavily rely on data annotations to tackle the moving objects in the scene. In response, we present LiDARGrid, a 3D opacity grid representation instantly derived from LiDAR points, which captures the dense 3D scene and facilitates scene forecasting. Our method features a novel self-supervised neural volume densification procedure based on an autoencoder and differentiable volume rendering. Leveraging this representation, self-supervised scene forecasting can be performed. Our method is trained on NuScenes dataset for autonomous driving, and is evaluated by predicting future point clouds using the scene forecasting. It notably outperforms state-of-the-art methods in point cloud forecasting in all performance metrics. Beyond scene forecasting, our representation excels in supporting additional tasks such as moving region detection and depth completion, as shown by experiments.", "keywords": "3D perception;lidar;opacity grid;occupancy grid;neural rendering;self-supervised learning;mobile robot;autonomous driving", "primary_area": "", "supplementary_material": "", "author": "Chuanyu Pan;Aolin Xu", "authorids": "~Chuanyu_Pan1;~Aolin_Xu1", "gender": "M;", "homepage": "https://pptrick.github.io/;", "dblp": ";", "google_scholar": "wNKoPGAAAAAJ;", "orcid": ";", "linkedin": "chuanyu-pan/;", "or_profile": "~Chuanyu_Pan1;~Aolin_Xu1", "aff": ";", "aff_domain": ";", "position": ";", "bibtex": "@inproceedings{\npan2024lidargrid,\ntitle={Li{DARG}rid: Self-supervised 3D Opacity Grid from Li{DAR} for Scene Forecasting},\nauthor={Chuanyu Pan and Aolin Xu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MfuzopqVOX}\n}", "github": "", "project": "", "reviewers": "rLH6;V6p6;8n74", "site": "https://openreview.net/forum?id=MfuzopqVOX", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:FzI9f-W2cZcJ:scholar.google.com/&scioq=LiDARGrid:+Self-supervised+3D+Opacity+Grid+from+LiDAR+for+Scene+Forecasting&hl=en&as_sdt=0,5", "gs_version_total": 0 }, { "id": "MsCbbIqHRA", "title": "ThinkGrasp: A Vision-Language System for Strategic Part Grasping in Clutter", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robotic grasping in cluttered environments remains a significant challenge due to occlusions and complex object arrangements. We have developed ThinkGrasp, a plug-and-play vision-language grasping system that makes use of GPT-4o's advanced contextual reasoning for grasping strategies. ThinkGrasp can effectively identify and generate grasp poses for target objects, even when they are heavily obstructed or nearly invisible, by using goal-oriented language to guide the removal of obstructing objects. This approach progressively uncovers the target object and ultimately grasps it with a few steps and a high success rate. 
In both simulated and real experiments, ThinkGrasp achieved a high success rate and significantly outperformed state-of-the-art methods in heavily cluttered environments or with diverse unseen objects, demonstrating strong generalization capabilities.", "keywords": "Robotic Grasping;Vision-Language Models;Language Conditioned Grasping", "primary_area": "", "supplementary_material": "/attachment/fffa2f32e0bce3dd7856074a3ae35b5afd2e0ffb.zip", "author": "Yaoyao Qian;Xupeng Zhu;Ondrej Biza;Shuo Jiang;Linfeng Zhao;Haojie Huang;Yu Qi;Robert Platt", "authorids": "~Yaoyao_Qian1;~Xupeng_Zhu1;~Ondrej_Biza1;~Shuo_Jiang1;~Linfeng_Zhao1;~Haojie_Huang1;~Yu_Qi4;~Robert_Platt1", "gender": "F;M;M;M;;M;F;", "homepage": "https://h-freax.github.io;https://zxp-s-works.github.io/;https://sites.google.com/view/obiza;;http://lfzhao.com;https://haojhuang.github.io/;https://github.com/yqi19;http://www.ccs.neu.edu/home/rplatt/", "dblp": "380/7236;257/4426;230/8616.html;;221/4652;144/2195;;39/5434", "google_scholar": "E0rCXLIAAAAJ;mwxz-8MAAAAJ;Gi9Xq8YAAAAJ;;;https://scholar.google.com/citations?hl=zh-CN;UZSbtlsAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;0000-0003-3390-8050;;;;;", "linkedin": ";xupengzhu-skunk;ond%C5%99ej-b%C3%AD%C5%BEa-a9405353/;shuo-jiang-7140b3137/;;;;", "or_profile": "~Yaoyao_Qian1;~Xupeng_Zhu1;~Ondrej_Biza1;~Shuo_Jiang1;~Linfeng_Zhao1;~Haojie_Huang1;~Yu_Qi4;~Robert_Platt1", "aff": "Northeastern University;Northeastern University;Northeastern University;;Meta;Northeastern University;Northeastern University;Northeastern University", "aff_domain": "neu.edu;northeastern.edu;northeastern.edu;;meta.com;northeastern.edu;northeastern.edu;neu.edu", "position": "MS student;PhD student;PhD student;;ML PhD Intern;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nqian2024thinkgrasp,\ntitle={ThinkGrasp: A Vision-Language System for Strategic Part Grasping in Clutter},\nauthor={Yaoyao Qian and Xupeng Zhu and Ondrej Biza and Shuo Jiang and Linfeng Zhao and Haojie Huang and Yu Qi and Robert Platt},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MsCbbIqHRA}\n}", "github": "https://github.com/H-Freax/ThinkGrasp", "project": "", "reviewers": "R6W6;ix3x;jQo9", "site": "https://openreview.net/forum?id=MsCbbIqHRA", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5347576988431916956&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "Northeastern University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.northeastern.edu;https://meta.com", "aff_unique_abbr": "NEU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "MwZJ96Okl3", "title": "Modeling Drivers\u2019 Situational Awareness from Eye Gaze for Driving Assistance", "track": "main", "status": "Poster", "tldr": "", "abstract": "Intelligent driving assistance can alert drivers to objects in their environment; however, such systems require a model of drivers' situational awareness (SA) (what aspects of the scene they are already aware of) to avoid unnecessary alerts. 
\nMoreover, collecting the data to train such an SA model is challenging: \nbeing an internal human cognitive state, driver SA is difficult to measure, and non-verbal signals such as eye gaze are some of the only outward manifestations of it. Traditional methods to obtain SA labels rely on probes that result in sparse, intermittent SA labels unsuitable for modeling a dense, temporally correlated process via machine learning. We propose a novel interactive labeling protocol that captures dense, continuous SA labels and use it to collect an object-level SA dataset in a VR driving simulator. Our dataset comprises 20 unique drivers' SA labels, driving data, and gaze (over 320 minutes of driving) which will be made public.\nAdditionally, we train an SA model from this data, formulating the object-level driver SA prediction problem as a semantic segmentation problem. Our formulation allows all objects in a scene at a timestep to be processed simultaneously, leveraging global scene context and local gaze-object relationships together.\nOur experiments show that this formulation leads to improved performance over common sense baselines and prior art on the SA prediction task.", "keywords": "driver awareness;driving assistance;situational awareness", "primary_area": "", "supplementary_material": "/attachment/90439beb4cce41d508ed86c1d97a121e5c06afeb.zip", "author": "Abhijat Biswas;Pranay Gupta;Shreeya Khurana;David Held;Henny Admoni", "authorids": "~Abhijat_Biswas1;~Pranay_Gupta1;srkhuran@andrew.cmu.edu;~David_Held1;~Henny_Admoni1", "gender": ";M;;M;", "homepage": "https://www.cs.cmu.edu/~abhijatb/;https://pranaygupta36.github.io;;http://davheld.github.io/;https://hennyadmoni.com", "dblp": ";;;22/11147;44/7075", "google_scholar": "NsV0tX8AAAAJ;;;0QtU-NsAAAAJ;XXiZaA4AAAAJ", "orcid": ";;;;", "linkedin": ";pranay-gupta-825713134/;;;", "or_profile": "~Abhijat_Biswas1;~Pranay_Gupta1;srkhuran@andrew.cmu.edu;~David_Held1;~Henny_Admoni1", "aff": "Carnegie Mellon University;Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;cmu.edu;;cmu.edu;cmu.edu", "position": "PhD student;MS student;;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nbiswas2024modeling,\ntitle={Modeling Drivers{\\textquoteright} Situational Awareness from Eye Gaze for Driving Assistance},\nauthor={Abhijat Biswas and Pranay Gupta and Shreeya Khurana and David Held and Henny Admoni},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MwZJ96Okl3}\n}", "github": "", "project": "", "reviewers": "NLkr;Qeo2;exse", "site": "https://openreview.net/forum?id=MwZJ96Okl3", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5226148497437490839&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "MyyZZAPgpy", "title": "SHADOW: Leveraging Segmentation Masks for Cross-Embodiment Policy Transfer", "track": "main", "status": "Poster", "tldr": "", "abstract": "Data collection in robotics is spread across diverse hardware, and 
this variation will increase as new hardware is developed. Effective use of this growing body of data requires methods capable of learning from diverse robot embodiments. We consider the setting of training a policy using expert trajectories from a single robot arm (the source), and evaluating on a different robot arm for which no data was collected (the target). We present a data editing scheme termed Shadow, in which the robot during training and evaluation is replaced with a composite segmentation mask of the source and target robots. In this way, the input data distribution at train and test time match closely, enabling robust policy transfer to the new unseen robot while being far more data efficient than approaches that require co-training on large amounts of data from diverse embodiments. We demonstrate that an approach as simple as Shadow is effective both in simulation on varying tasks and robots, and on real robot hardware, where Shadow demonstrates over 2x improvement in success rate compared to the strongest baseline.", "keywords": "Cross-embodiment learning;Imitation Learning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/76358074c653b30c5d64d85a3681ff0a73af784a.zip", "author": "Marion Lepert;Ria Doshi;Jeannette Bohg", "authorids": "~Marion_Lepert1;~Ria_Doshi1;~Jeannette_Bohg1", "gender": "F;;", "homepage": "https://marionlepert.github.io/;;https://web.stanford.edu/~bohg/", "dblp": "303/0652;;52/7377", "google_scholar": "Wp5ZuXgAAAAJ;;rjnJnEkAAAAJ", "orcid": ";;0000-0002-4921-7193", "linkedin": ";riadoshi;", "or_profile": "~Marion_Lepert1;~Ria_Doshi1;~Jeannette_Bohg1", "aff": "Stanford University;University of California, Berkeley;Stanford University", "aff_domain": "stanford.edu;berkeley.edu;stanford.edu", "position": "PhD student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nlepert2024shadow,\ntitle={{SHADOW}: Leveraging Segmentation Masks for Cross-Embodiment Policy Transfer},\nauthor={Marion Lepert and Ria Doshi and Jeannette Bohg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=MyyZZAPgpy}\n}", "github": "", "project": "", "reviewers": "Hf9g;PcxW;ASdD", "site": "https://openreview.net/forum?id=MyyZZAPgpy", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;2", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16427281927737155084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;0", "aff_unique_norm": "Stanford University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.berkeley.edu", "aff_unique_abbr": "Stanford;UC Berkeley", "aff_campus_unique_index": "0;1;0", "aff_campus_unique": "Stanford;Berkeley", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "N1K4B8N3n1", "title": "Scaling Safe Multi-Agent Control for Signal Temporal Logic Specifications", "track": "main", "status": "Poster", "tldr": "", "abstract": "Existing methods for safe multi-agent control using logic specifications like Signal Temporal Logic (STL) often face scalability issues. This is because they rely either on single-agent perspectives or on Mixed Integer Linear Programming (MILP)-based planners, which are complex to optimize. 
These methods have proven to be computationally expensive and inefficient when dealing with a large number of agents. To address these limitations, we present a new scalable approach to multi-agent control in this setting. Our method treats the relationships between agents using a graph structure rather than in terms of a single-agent perspective. Moreover, it combines a multi-agent collision avoidance controller with a Graph Neural Network (GNN) based planner, models the system in a decentralized fashion, and trains on STL-based objectives to generate safe and efficient plans for multiple agents, thereby optimizing the satisfaction of complex temporal specifications while also facilitating multi-agent collision avoidance. Our experiments show that our approach significantly outperforms existing methods that use a state-of-the-art MILP-based planner in terms of scalability and performance.", "keywords": "Multi-Robot Systems;Path Planning for Multiple Mobile Robots or Agents;Collision Avoidance;Hybrid Logical/Dynamical Planning and Verification;Deep Learning Methods", "primary_area": "", "supplementary_material": "/attachment/9be50bb7d0ffe90905e8d4c036a141401be48e67.zip", "author": "Joe Eappen;Zikang Xiong;Dipam Patel;Aniket Bera;Suresh Jagannathan", "authorids": "~Joe_Eappen2;~Zikang_Xiong1;~Dipam_Patel1;~Aniket_Bera1;~Suresh_Jagannathan1", "gender": "M;M;M;M;M", "homepage": "https://jeappen.github.io/;https://xiong.zikang.me;https://dipampatel.in/;http://cs.purdue.edu/~ab;http://www.cs.purdue.edu/homes/suresh", "dblp": "267/5377;https://dblp.uni-trier.de/pid/242/4529.html;;93/11476;j/SJagannathan.html", "google_scholar": "98R6dEQAAAAJ;H-EoAgYAAAAJ;lKI3gocAAAAJ;q3UdHk4AAAAJ;https://scholar.google.com/scholar?hl=en", "orcid": "0000-0001-9386-5545;;0000-0002-6082-4525;0000-0002-0182-6985;0000-0001-6871-2424", "linkedin": "jeappen/;;dnipatel/;;", "or_profile": "~Joe_Eappen2;~Zikang_Xiong1;~Dipam_Patel1;~Aniket_Bera1;~Suresh_Jagannathan1", "aff": "Purdue University;Purdue University;Purdue University;University of Maryland, College Park;", "aff_domain": "purdue.edu;purdue.edu;purdue.edu;umd.edu;", "position": "PhD student;PhD student;PhD student;Adjunct Associate Professor;", "bibtex": "@inproceedings{\neappen2024scaling,\ntitle={Scaling Safe Multi-Agent Control for Signal Temporal Logic Specifications},\nauthor={Joe Eappen and Zikang Xiong and Dipam Patel and Aniket Bera and Suresh Jagannathan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N1K4B8N3n1}\n}", "github": "https://github.com/jeappen/mastl-gcbf", "project": "", "reviewers": "uqbs;rCpB;HZ5o", "site": "https://openreview.net/forum?id=N1K4B8N3n1", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2205659342773477094&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Purdue University;University of Maryland", "aff_unique_dep": ";", "aff_unique_url": "https://www.purdue.edu;https://www/umd.edu", "aff_unique_abbr": "Purdue;UMD", "aff_campus_unique_index": "1", "aff_campus_unique": ";College Park", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "N5IS6DzBmL", "title": "Play to the Score: Stage-Guided Dynamic Multi-Sensory Fusion for Robotic Manipulation", "track": "main", "status": "Poster", 
"tldr": "", "abstract": "Humans possess a remarkable talent for flexibly alternating to different senses when interacting with the environment. Picture a chef skillfully gauging the timing of ingredient additions and controlling the heat according to the colors, sounds, and aromas, seamlessly navigating through every stage of the complex cooking process. This ability is founded upon a thorough comprehension of task stages, as achieving the sub-goal within each stage can necessitate the utilization of different senses. In order to endow robots with similar ability, we incorporate the task stages divided by sub-goals into the imitation learning process to accordingly guide dynamic multi-sensory fusion. We propose MS-Bot, a stage-guided dynamic multi-sensory fusion method with coarse-to-fine stage understanding, which dynamically adjusts the priority of modalities based on the fine-grained state within the predicted current stage. We train a robot system equipped with visual, auditory, and tactile sensors to accomplish challenging robotic manipulation tasks: pouring and peg insertion with keyway. Experimental results indicate that our approach enables more effective and explainable dynamic fusion, aligning more closely with the human fusion process than existing methods.", "keywords": "Multi-Sensory;Robotic Manipulation;Multi-Stage", "primary_area": "", "supplementary_material": "/attachment/523a75a9876b440fc40eda87b9aa7aab57adfb22.zip", "author": "Ruoxuan Feng;Di Hu;Wenke Ma;Xuelong Li", "authorids": "~Ruoxuan_Feng1;~Di_Hu1;~Wenke_Ma1;~Xuelong_Li2", "gender": "M;M;M;M", "homepage": "https://xxuan01.github.io/;https://dtaoo.github.io/;;https://github.com/HANDS-FREE", "dblp": "339/6970;49/8496-1;l/XuelongLi;", "google_scholar": "https://scholar.google.com.hk/citations?user=Ma0FKqYAAAAJ;https://scholar.google.com.hk/citations?user=F7bvTOEAAAAJ;ahUibskAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Ruoxuan_Feng1;~Di_Hu1;~Xuelong_Li2;~wenkeMa1", "aff": "Renmin University of China;Renmin University of China;Northwestern Polytechnical University;Department of Computer Science, University of Massachusetts at Amherst", "aff_domain": "ruc.edu.cn;ruc.edu.cn;nwpu.edu.cn;cs.umass.edu", "position": "MS student;Associate Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nfeng2024play,\ntitle={Play to the Score: Stage-Guided Dynamic Multi-Sensory Fusion for Robotic Manipulation},\nauthor={Ruoxuan Feng and Di Hu and Wenke Ma and Xuelong Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=N5IS6DzBmL}\n}", "github": "https://github.com/GeWu-Lab/MS-Bot", "project": "", "reviewers": "rczR;mfxN;ConB", "site": "https://openreview.net/forum?id=N5IS6DzBmL", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;5;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12307254537853349947&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "Renmin University of China;Northwestern Polytechnical University;University of Massachusetts Amherst", "aff_unique_dep": ";;Department of Computer Science", "aff_unique_url": "http://www.ruc.edu.cn;https://www.nwpu.edu.cn;https://www.umass.edu", "aff_unique_abbr": "RUC;NWPU;UMass Amherst", "aff_campus_unique_index": "1", "aff_campus_unique": ";Amherst", 
"aff_country_unique_index": "0;0;0;1", "aff_country_unique": "China;United States" }, { "id": "NCnplCf4wo", "title": "Learning a Distributed Hierarchical Locomotion Controller for Embodied Cooperation", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this work, we propose a distributed hierarchical locomotion control strategy for whole-body cooperation and demonstrate the potential for migration into large numbers of agents. Our method utilizes a hierarchical structure to break down complex tasks into smaller, manageable sub-tasks. By incorporating spatiotemporal continuity features, we establish the sequential logic necessary for causal inference and cooperative behaviour in sequential tasks, thereby facilitating efficient and coordinated control strategies. Through training within this framework, we demonstrate enhanced adaptability and cooperation, leading to superior performance in task completion compared to the original methods. Moreover, we construct a set of environments as the benchmark for embodied cooperation.", "keywords": "Cooperation;Locomotion;Hierarchical reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/62113bbad3c43bde14b1e0052168a9042947422d.zip", "author": "Chuye Hong;Kangyao Huang;Huaping Liu", "authorids": "~Chuye_Hong1;~Kangyao_Huang1;~Huaping_Liu3", "gender": "M;;M", "homepage": ";https://kangyao-huang.tech/;https://sites.google.com/site/thuliuhuaping/", "dblp": ";;69/1097-1", "google_scholar": ";https://scholar.google.com.hk/citations?user=df7fnwQAAAAJ;https://scholar.google.com.hk/citations?user=HXnkIkwAAAAJ", "orcid": "0009-0005-4679-2212;;", "linkedin": ";;", "or_profile": "~Chuye_Hong1;~Kangyao_Huang1;~Huaping_Liu3", "aff": "Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "mails.tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "Undergrad student;PhD student;Full Professor", "bibtex": "@inproceedings{\nhong2024learning,\ntitle={Learning a Distributed Hierarchical Locomotion Controller for Embodied Cooperation},\nauthor={Chuye Hong and Kangyao Huang and Huaping Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NCnplCf4wo}\n}", "github": "", "project": "", "reviewers": "gDjg;Mqnz;pYHV", "site": "https://openreview.net/forum?id=NCnplCf4wo", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;5;3", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8461688058015276171&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "NiA8hVdDS7", "title": "RoboKoop: Efficient Control Conditioned Representations from Visual Input in Robotics using Koopman Operator", "track": "main", "status": "Poster", "tldr": "", "abstract": "Developing agents that can perform complex control tasks from high-dimensional observations is a core ability of autonomous agents that requires underlying robust task control policies and adapting the underlying visual representations to the task. 
Most existing policies need a lot of training samples and treat this problem from the lens of two-stage learning with a controller learned on top of pre-trained vision models. We approach this problem from the lens of Koopman theory and learn visual representations from robotic agents conditioned on specific downstream tasks in the context of learning stabilizing control for the agent. We introduce a Contrastive Spectral Koopman Embedding network that allows us to learn efficient linearized visual representations from the agent's visual data in a high dimensional latent space and utilizes reinforcement learning to perform off-policy control on top of the extracted representations with a linear controller. Our method enhances stability and control in gradient dynamics over time, significantly outperforming existing approaches by improving efficiency and accuracy in learning task policies over extended horizons.", "keywords": "Feature extraction;Task Feedback;Control", "primary_area": "", "supplementary_material": "/attachment/bc4ffb3d2ce6cc0886f7510ec87037527709d169.zip", "author": "Hemant Kumawat;Biswadeep Chakraborty;Saibal Mukhopadhyay", "authorids": "~Hemant_Kumawat1;~Biswadeep_Chakraborty1;~Saibal_Mukhopadhyay2", "gender": "M;M;M", "homepage": "https://hemantkumawat.com/;;https://greenlab.ece.gatech.edu", "dblp": "319/0195;238/0554;66/1210", "google_scholar": "2iUnwBwAAAAJ;8soIjY8AAAAJ;5KRtMEkAAAAJ", "orcid": ";;0000-0002-8894-3390", "linkedin": "kumawathemant/;;", "or_profile": "~Hemant_Kumawat1;~Biswadeep_Chakraborty1;~Saibal_Mukhopadhyay2", "aff": "Georgia Institute of Technology;Georgia Institute of Technology;Georgia Institute of Technology", "aff_domain": "gatech.edu;gatech.edu;gatech.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nkumawat2024robokoop,\ntitle={RoboKoop: Efficient Control Conditioned Representations from Visual Input in Robotics using Koopman Operator},\nauthor={Hemant Kumawat and Biswadeep Chakraborty and Saibal Mukhopadhyay},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=NiA8hVdDS7}\n}", "github": "", "project": "", "reviewers": "5VsF;n623;eUws;MzWx", "site": "https://openreview.net/forum?id=NiA8hVdDS7", "pdf_size": 0, "rating": "2;2;3;3", "confidence": "3;4;4;4", "rating_avg": 2.5, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1321971353451471846&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Georgia Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://www.gatech.edu", "aff_unique_abbr": "Georgia Tech", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "O05tIQt2d5", "title": "TOP-Nav: Legged Navigation Integrating Terrain, Obstacle and Proprioception Estimation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Legged navigation is typically examined within open-world, off-road, and challenging environments. In these scenarios, estimating external disturbances requires a complex synthesis of multi-modal information. This underlines a major limitation in existing works that primarily focus on avoiding obstacles. 
In this work, we propose TOP-Nav, a novel legged navigation framework that integrates a comprehensive path planner with Terrain awareness, Obstacle avoidance and closed-loop Proprioception. TOP-Nav underscores the synergies between vision and proprioception in both path and motion planning. Within the path planner, we present a terrain estimator that enables the robot to select waypoints on terrains with higher traversability while effectively avoiding obstacles. At the motion planning level, we construct a proprioception advisor from the learning-based locomotion controller to provide motion evaluations for the path planner. Based on the closed-loop motion feedback, we offer online corrections for the vision-based terrain and obstacle estimations. Consequently, TOP-Nav achieves open-world navigation in which the robot can handle terrains or disturbances beyond the distribution of prior knowledge and overcomes constraints imposed by visual conditions. Building upon extensive experiments conducted in both simulation and real-world environments, TOP-Nav demonstrates superior performance in open-world navigation compared to existing methods.", "keywords": "Navigation;Task Planning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/d276b9c99edb409a38df56cc742bb099d3c5416a.zip", "author": "Junli Ren;Yikai Liu;Yingru Dai;Junfeng Long;Guijin Wang", "authorids": "~Junli_Ren1;~Yikai_Liu2;~Yingru_Dai1;~Junfeng_Long1;~Guijin_Wang1", "gender": "M;M;;M;M", "homepage": ";http://wikim.ee.tsinghua.edu.cn/display/~1343024778;;https://junfeng-long.github.io/;http://web.ee.tsinghua.edu.cn/wangguijin/zh_CN/index/2769/list/index.htm", "dblp": ";;;343/2990;37/6836", "google_scholar": ";;;olmfqBEAAAAJ;qDjozE4AAAAJ", "orcid": "0000-0003-1288-2680;;;0000-0001-7047-4963;", "linkedin": ";;;;", "or_profile": "~Junli_Ren1;~Yikai_Liu2;~Yingru_Dai1;~Junfeng_Long1;~Guijin_Wang1", "aff": "University of Hong Kong;;;Shanghai AI Laboratory;Department of Electronic Engineering, Tsinghua University", "aff_domain": "hku.hk;;;pjlab.org.cn;tsinghua.edu.cn", "position": "PhD student;;;Researcher;Full Professor", "bibtex": "@inproceedings{\nren2024topnav,\ntitle={{TOP}-Nav: Legged Navigation Integrating Terrain, Obstacle and Proprioception Estimation},\nauthor={Junli Ren and Yikai Liu and Yingru Dai and Junfeng Long and Guijin Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O05tIQt2d5}\n}", "github": "", "project": "", "reviewers": "VJBQ;BuXi;CGbG", "site": "https://openreview.net/forum?id=O05tIQt2d5", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12765786386208647540&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Hong Kong;Shanghai AI Laboratory;Tsinghua University", "aff_unique_dep": ";;Department of Electronic Engineering", "aff_unique_url": "https://www.hku.hk;https://www.shanghai-ai-lab.com;https://www.tsinghua.edu.cn", "aff_unique_abbr": "HKU;SAIL;THU", "aff_campus_unique_index": "0", "aff_campus_unique": "Hong Kong SAR;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "O0oK2bVist", "title": "Adapting Humanoid Locomotion over Challenging Terrain via Two-Phase Training", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humanoid 
robots are a key focus in robotics, with their capacity to navigate tough terrains being essential for many uses. While strides have been made, creating adaptable locomotion for complex environments is still tough. Recent progress in learning-based systems offers hope for robust legged locomotion, but challenges persist, such as tracking accuracy at high speeds and on uneven ground, and joint oscillations in actual robots.\n This paper proposes a novel training framework to address these challenges by employing a two-phase training paradigm with reinforcement learning. The proposed framework is further enhanced through the integration of command curriculum learning, refining the precision and adaptability of our approach. Additionally, we adapt DreamWaQ to our humanoid locomotion system and improve it to mitigate joint oscillations. Finally, we achieve the sim-to-real transfer of our method. A series of empirical results demonstrate the superior performance of our proposed method compared to state-of-the-art methods.", "keywords": "humanoid robots;locomotion;reinforcement learning;curriculum;sim-to-real", "primary_area": "", "supplementary_material": "/attachment/9d97097b3a83c622c1d8abaa8e782231ec3904be.zip", "author": "Wenhao Cui;Shengtao Li;Huaxing Huang;Bangyu Qin;Tianchu Zhang;hanjinchao;Liang Zheng;Ziyang Tang;Chenxu Hu;NING Yan;Jiahao Chen;Zheyuan Jiang", "authorids": "~Wenhao_Cui2;~Shengtao_Li3;~Huaxing_Huang1;~Bangyu_Qin1;~Tianchu_Zhang1;~hanjinchao1;~Liang_Zheng7;~Ziyang_Tang3;~Chenxu_Hu1;~NING_Yan2;~Jiahao_Chen14;~Zheyuan_Jiang1", "gender": "M;M;M;M;F;;M;M;M;;M;M", "homepage": ";https://github.com/listao0818;;https://github.com/bangyu95;https://github.com/Chortine;https://github.com/jchzylayg;https://noetixrobotics.com/;https://github.com/Jackie-Tang1/;https://huchenxucs.github.io/;;https://faculty.sist.shanghaitech.edu.cn/chenjh/;https://merlinjiang.github.io/", "dblp": ";;;;;;;;222/6365;;;", "google_scholar": ";;https://scholar.google.com/citations?view_op=list_works;;;;;;4LzKZggAAAAJ;;A13oCMQAAAAJ;", "orcid": ";;;;;;;;;;0000-0002-8927-5646;", "linkedin": "wenhao-cui-066b4a175/;;;;;;;;;;jiahao-chen/;", "or_profile": "~Wenhao_Cui2;~Shengtao_Li3;~Huaxing_Huang1;~Bangyu_Qin1;~Tianchu_Zhang1;~hanjinchao1;~Liang_Zheng7;~Ziyang_Tang3;~Chenxu_Hu1;~NING_Yan2;~Jiahao_Chen14;~Zheyuan_Jiang1", "aff": "University of Southern California;North University of China;Noetic Robotics;Shanghai Jiaotong University;Noetix Robotics;Noetic Robotics;University of Electronic Science and Technology of China;State University of New York at Stony Brook;Tsinghua University;;ShanghaiTech University;Institute for Interdisciplinary Information Sciences, Tsinghua University, Tsinghua University", "aff_domain": "usc.edu;mail.st.nuc.edu.cn;noetixrobotics.com;sjtu.edu.cn;noetixrobotics.com;noetixrobotics.com;uestc.edu;stonybrook.edu;tsinghua.edu.cn;;shanghaitech.edu.cn;mails.tsinghua.edu.cn", "position": "MS student;MS student;Researcher;MS student;Researcher;Researcher;MS student;MS student;PhD student;;Assistant Professor;PhD student", "bibtex": "@inproceedings{\ncui2024adapting,\ntitle={Adapting Humanoid Locomotion over Challenging Terrain via Two-Phase Training},\nauthor={Wenhao Cui and Shengtao Li and Huaxing Huang and Bangyu Qin and Tianchu Zhang and hanjinchao and Liang Zheng and Ziyang Tang and Chenxu Hu and NING Yan and Jiahao Chen and Zheyuan Jiang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=O0oK2bVist}\n}", "github": "", "project": "", 
"reviewers": "rN9h;61BH;RdfP", "site": "https://openreview.net/forum?id=O0oK2bVist", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 12, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10678264963598995821&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;2;5;6;7;8;7", "aff_unique_norm": "University of Southern California;North University of China;Noetic Robotics;Shanghai Jiao Tong University;Noetix Robotics;University of Electronic Science and Technology of China;State University of New York at Stony Brook;Tsinghua University;ShanghaiTech University", "aff_unique_dep": ";;;;;;;;", "aff_unique_url": "https://www.usc.edu;http://www.nuc.edu.cn;;https://www.sjtu.edu.cn;;https://www.uestc.edu.cn;https://www.stonybrook.edu;https://www.tsinghua.edu.cn;https://www.shanghaitech.edu.cn", "aff_unique_abbr": "USC;NUC;;SJTU;;UESTC;SUNY Stony Brook;THU;ShanghaiTech", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Los Angeles;;Stony Brook", "aff_country_unique_index": "0;1;0;1;0;0;1;0;1;1;1", "aff_country_unique": "United States;China" }, { "id": "OGjGtN6hoo", "title": "Adaptive Language-Guided Abstraction from Contrastive Explanations", "track": "main", "status": "Poster", "tldr": "", "abstract": "Many approaches to robot learning begin by inferring a reward function from a set of human demonstrations.\nTo learn a good reward, it is necessary to determine which features of the environment are relevant before determining how these features should be used to compute reward.\nIn particularly complex, high-dimensional environments, human demonstrators often struggle to fully specify their desired behavior from a small number of demonstrations.\nEnd-to-end reward learning methods (e.g., using deep networks or program synthesis techniques) often yield brittle reward functions that are sensitive to spurious state features.\nBy contrast, humans can often generalizably learn from a small number of demonstrations by incorporating strong priors about what features of a demonstration are likely meaningful for a task of interest. \nHow do we build robots that leverage this kind of background knowledge when learning from new demonstrations?\nThis paper describes a method named ALGAE which alternates between using language models to iteratively identify human-meaningful features needed to explain demonstrated behavior, then standard inverse reinforcement learning techniques to assign weights to these features.\nExperiments across a variety of both simulated and real-world robot environments show that ALGAElearns generalizable reward functions defined on interpretable features using only small numbers of demonstrations.\nImportantly, ALGAE can recognize when features are missing, then extract and define those features without any human input -- making it possible to quickly and efficiently acquire rich representations of user behavior.", "keywords": "reward learning;language-guided abstraction;reward features", "primary_area": "", "supplementary_material": "/attachment/f846fbe2d170512ca5083aec675cf29cb6698635.zip", "author": "Andi Peng;Belinda Z. 
Li;Ilia Sucholutsky;Nishanth Kumar;Julie Shah;Jacob Andreas;Andreea Bobu", "authorids": "~Andi_Peng1;~Belinda_Z._Li1;~Ilia_Sucholutsky1;~Nishanth_Kumar1;~Julie_Shah2;~Jacob_Andreas1;~Andreea_Bobu1", "gender": "F;M;M;F;M;F;F", "homepage": "https://andipeng.com/;https://ilia10000.github.io/;http://nishanthjkumar.com/;https://interactive.mit.edu;http://web.mit.edu/jda/www;https://andreea7b.github.io/;https://belindal.github.io/", "dblp": "242/9185;239/5108;211/7595;;97/8154;187/2860;263/9914", "google_scholar": "S63gb38AAAAJ;https://scholar.google.ca/citations?user=6MfHyuMAAAAJ;FE512o4AAAAJ;;dnZ8udEAAAAJ;62e5CygAAAAJ;700fyvEAAAAJ", "orcid": ";0000-0003-4121-7479;0000-0001-9291-3728;;;0000-0002-9507-7427;", "linkedin": ";iliasu/;nishanth-kumar;;;andreea-bobu-a2940277/;", "or_profile": "~Andi_Peng1;~Ilia_Sucholutsky1;~Nishanth_Kumar1;~Julie_Shah2;~Jacob_Andreas1;~Andreea_Bobu1;~Belinda_Zou_Li1", "aff": "Massachusetts Institute of Technology;Princeton University;The AI Institute;Massachusetts Institute of Technology;Microsoft;The AI Institute;Massachusetts Institute of Technology", "aff_domain": "mit.edu;princeton.edu;theaiinstitute.com;mit.edu;microsoft.com;theaiinstitute.com;mit.edu", "position": "PhD student;Postdoc;Intern;Professor;Researcher;Researcher;PhD student", "bibtex": "@inproceedings{\npeng2024adaptive,\ntitle={Adaptive Language-Guided Abstraction from Contrastive Explanations},\nauthor={Andi Peng and Belinda Z. Li and Ilia Sucholutsky and Nishanth Kumar and Julie Shah and Jacob Andreas and Andreea Bobu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OGjGtN6hoo}\n}", "github": "", "project": "", "reviewers": "cr8T;vt5Y;sFkR", "site": "https://openreview.net/forum?id=OGjGtN6hoo", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10794021499273686387&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;0;3;2;0", "aff_unique_norm": "Massachusetts Institute of Technology;Princeton University;AI Institute;Microsoft", "aff_unique_dep": ";;;Microsoft Corporation", "aff_unique_url": "https://web.mit.edu;https://www.princeton.edu;;https://www.microsoft.com", "aff_unique_abbr": "MIT;Princeton;;Microsoft", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Oce2215aJE", "title": "Body Transformer: Leveraging Robot Embodiment for Policy Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "In recent years, the transformer architecture has become the de-facto standard for machine learning algorithms applied to natural language processing and computer vision. Despite notable evidence of successful deployment of this architecture in the context of robot learning, we claim that vanilla transformers do not fully exploit the structure of the robot learning problem. We propose Body Transformer (BoT), an architecture that exploits the robot embodiment by providing an inductive bias that guides the learning process. We represent the robot body as a graph of sensors and actuators, and rely on masked attention to pool information through the architecture. 
The resulting architecture outperforms the vanilla transformer, as well as the classical multilayer perceptron, with respect to task completion, scaling properties, and computational efficiency when representing either imitation or reinforcement learning policies.", "keywords": "Robot Learning;Graph Neural Networks;Imitation Learning;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/b0dc20c7648624720a84fecee640f61b5652a8c4.zip", "author": "Carmelo Sferrazza;Dun-Ming Huang;Fangchen Liu;Jongmin Lee;Pieter Abbeel", "authorids": "~Carmelo_Sferrazza1;~Dun-Ming_Huang1;~Fangchen_Liu2;~Jongmin_Lee1;~Pieter_Abbeel2", "gender": ";M;F;M;M", "homepage": "https://sferrazza.cc;https://bransthre.github.io/;https://fangchenliu.github.io/;https://www.jmlee.kr;https://people.eecs.berkeley.edu/~pabbeel/", "dblp": "190/8406;;;68/222-4.html;", "google_scholar": "x0_lwNYAAAAJ;;;https://scholar.google.co.kr/citations?user=rFcK8EEAAAAJ;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ", "orcid": "0000-0002-7432-7634;;;;", "linkedin": "csferrazza/;;;jmlee123/;", "or_profile": "~Carmelo_Sferrazza1;~Dun-Ming_Huang1;~Fangchen_Liu2;~Jongmin_Lee1;~Pieter_Abbeel2", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Covariant", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;berkeley.edu;covariant.ai", "position": "Postdoc;Undergrad student;PhD student;Postdoc;Founder", "bibtex": "@inproceedings{\nsferrazza2024body,\ntitle={Body Transformer: Leveraging Robot Embodiment for Policy Learning},\nauthor={Carmelo Sferrazza and Dun-Ming Huang and Fangchen Liu and Jongmin Lee and Pieter Abbeel},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Oce2215aJE}\n}", "github": "https://github.com/carlosferrazza/BodyTransformer", "project": "", "reviewers": "tLjG;W7Gq;f458", "site": "https://openreview.net/forum?id=Oce2215aJE", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11929031293013873433&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Covariant", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;", "aff_unique_abbr": "UC Berkeley;", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "OznnnxPLiH", "title": "JointMotion: Joint Self-Supervision for Joint Motion Prediction", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present JointMotion, a self-supervised pre-training method for joint motion prediction in self-driving vehicles. Our method jointly optimizes a scene-level objective connecting motion and environments, and an instance-level objective to refine learned representations. Scene-level representations are learned via non-contrastive similarity learning of past motion sequences and environment context. At the instance level, we use masked autoencoding to refine multimodal polyline representations. 
We complement this with an adaptive pre-training decoder that enables JointMotion to generalize across different environment representations, fusion mechanisms, and dataset characteristics. Notably, our method reduces the joint final displacement error of Wayformer, HPTR, and Scene Transformer models by 3%, 8%, and 12%, respectively; and enables transfer learning between the Waymo Open Motion and the Argoverse 2 Motion Forecasting datasets.", "keywords": "Self-supervised learning;representation learning;multimodal pre-training;motion prediction;data-efficient learning", "primary_area": "", "supplementary_material": "", "author": "Royden Wagner;Omer Sahin Tas;Marvin Klemp;Carlos Fernandez", "authorids": "~Royden_Wagner1;~Omer_Sahin_Tas1;~Marvin_Klemp1;~Carlos_Fernandez1", "gender": "M;;M;", "homepage": ";https://www.omersahintas.com;https://www.linkedin.com/in/marvin-klemp-371428201/;https://www.mrt.kit.edu", "dblp": "318/1159;122/3330.html;;79/11147.html", "google_scholar": "tvltjqQAAAAJ;https://scholar.google.de/citations?user=opaVrnQAAAAJ;ZOCUUfoAAAAJ;OIF2_EMAAAAJ", "orcid": ";;;0000-0002-0417-6762", "linkedin": "royden-wagner-35843919b;;;", "or_profile": "~Royden_Wagner1;~Omer_Sahin_Tas1;~Marvin_Klemp1;~Carlos_Fernandez1", "aff": "Karlsruhe Institute of Technology;FZI Research Center for Information Technology;Karlsruhe Institute of Technology;Karlsruher Institut f\u00fcr Technologie", "aff_domain": "kit.edu;fzi.de;kit.edu;kit.edu", "position": "PhD student;Research Scientist;PhD student;Postdoc", "bibtex": "@inproceedings{\nwagner2024jointmotion,\ntitle={JointMotion: Joint Self-Supervision for Joint Motion Prediction},\nauthor={Royden Wagner and Omer Sahin Tas and Marvin Klemp and Carlos Fernandez},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=OznnnxPLiH}\n}", "github": "https://github.com/kit-mrt/future-motion", "project": "", "reviewers": "XhvA;ZYgU;ffHe", "site": "https://openreview.net/forum?id=OznnnxPLiH", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13579644145748203094&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Karlsruhe Institute of Technology;FZI Research Center for Information Technology;Karlsruher Institut f\u00fcr Technologie", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kit.edu;https://www.fzi.de;https://www.kit.edu", "aff_unique_abbr": "KIT;FZI;KIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "Germany" }, { "id": "PAtsxVz0ND", "title": "ScissorBot: Learning Generalizable Scissor Skill for Paper Cutting via Simulation, Imitation, and Sim2Real", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper tackles the challenging robotic task of generalizable paper cutting using scissors. \nIn this task, scissors attached to a robot arm are driven to accurately cut curves drawn on the paper, which is hung with the top edge fixed. 
\nDue to the frequent paper-scissor contact and consequent fracture, the paper features continual deformation and changing topology, which is difficult for accurate modeling. To deal with such versatile scenarios, we propose ScissorBot, the first learning-based system for robotic paper cutting with scissors via simulation, imitation learning and sim2real. Given the lack of sufficient data for this task, we build PaperCutting-Sim, a paper simulator supporting interactive fracture coupling with scissors, enabling demonstration generation with a heuristic-based oracle policy. \nTo ensure effective execution, we customize an action primitive sequence for imitation learning to constrain its action space, thus alleviating potential compounding errors.\nFinally, by integrating sim-to-real techniques to bridge the gap between simulation and reality, our policy can be effectively deployed on the real robot.\nExperimental results demonstrate that our method surpasses all baselines in both simulation and real-world benchmarks and achieves performance comparable to human operation with a single hand under the same conditions.", "keywords": "Deformable Object Manipulation;Imitation Learning;Sim-to-Real", "primary_area": "", "supplementary_material": "/attachment/2003474de3f513f90da0595c5f1f7d6cb7337561.zip", "author": "Jiangran Lyu;Yuxing Chen;Tao Du;Feng Zhu;Huiquan Liu;Yizhou Wang;He Wang", "authorids": "~Jiangran_Lyu2;~Yuxing_Chen3;~Tao_Du1;~Feng_Zhu12;~Huiquan_Liu1;~Yizhou_Wang1;~He_Wang5", "gender": "M;M;;M;M;M;M", "homepage": "https://jiangranlv.github.io/;https://chen01yx.github.io/;https://people.iiis.tsinghua.edu.cn/~taodu/;https://github.com/jzzhufeng;https://yz.nwafu.edu.cn/xydsfc/zwbhxy/722f8c0727c54c04a032012f9371f610.htm;https://cfcs.pku.edu.cn/wangyizhou/;https://hughw19.github.io", "dblp": ";;51/3026-1;;;71/3387-1;01/6368-10", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;;831z_VcAAAAJ;roCAWkoAAAAJ", "orcid": ";;0000-0001-7337-7667;;;;", "linkedin": ";;;;;;", "or_profile": "~Jiangran_Lyu2;~Yuxing_Chen3;~Tao_Du1;~Feng_Zhu12;~Huiquan_Liu1;~Yizhou_Wang1;~He_Wang5", "aff": "Peking University;Peking University;Shanghai Qi Zhi Institute;;University of Electronic Science and Technology of China;Peking University;Peking University", "aff_domain": "stu.pku.edu.cn;stu.pku.edu.cn;sqz.ac.cn;;uestc.edu.cn;pku.edu.cn;pku.edu.cn", "position": "PhD student;Undergrad student;Principal investigator;;Researcher;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nlyu2024scissorbot,\ntitle={ScissorBot: Learning Generalizable Scissor Skill for Paper Cutting via Simulation, Imitation, and Sim2Real},\nauthor={Jiangran Lyu and Yuxing Chen and Tao Du and Feng Zhu and Huiquan Liu and Yizhou Wang and He Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PAtsxVz0ND}\n}", "github": "", "project": "", "reviewers": "2CTS;n7FW;Wc2M", "site": "https://openreview.net/forum?id=PAtsxVz0ND", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1706479238685734135&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "Peking University;Shanghai Qi Zhi Institute;University of Electronic Science and Technology of China", "aff_unique_dep": ";;", "aff_unique_url": 
"http://www.pku.edu.cn;https://www.qz.io;https://www.uestc.edu.cn", "aff_unique_abbr": "Peking U;;UESTC", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "PbQOZntuXO", "title": "One Policy to Run Them All: an End-to-end Learning Approach to Multi-Embodiment Locomotion", "track": "main", "status": "Poster", "tldr": "", "abstract": "Deep Reinforcement Learning techniques are achieving state-of-the-art results in robust legged locomotion.\nWhile there exists a wide variety of legged platforms such as quadruped, humanoids, and hexapods, the field is still missing a single learning framework that can control all these different embodiments easily and effectively and possibly transfer, zero or few-shot, to unseen robot embodiments.\nTo close this gap, we introduce URMA, the Unified Robot Morphology Architecture. Our framework brings the end-to-end Multi-Task Reinforcement Learning approach to the realm of legged robots, enabling the learned policy to control any type of robot morphology.\nThe key idea of our method is to allow the network to learn an abstract locomotion controller that can be seamlessly shared between embodiments thanks to our morphology-agnostic encoders and decoders. This flexible architecture can be seen as a first step in building a foundation model for legged robot locomotion.\nOur experiments show that URMA can learn a locomotion policy on multiple embodiments that can be easily transferred to unseen robot platforms in simulation and the real world.", "keywords": "Locomotion;Reinforcement Learning;Multi-embodiment Learning", "primary_area": "", "supplementary_material": "/attachment/0e07d7c4aec25d5f53e05f2a93bb39ccff8ab039.zip", "author": "Nico Bohlinger;Grzegorz Czechmanowski;Maciej Piotr Krupka;Piotr Kicki;Krzysztof Walas;Jan Peters;Davide Tateo", "authorids": "~Nico_Bohlinger1;~Grzegorz_Czechmanowski1;~Maciej_Piotr_Krupka1;~Piotr_Kicki1;~Krzysztof_Walas2;~Jan_Peters3;~Davide_Tateo2", "gender": "M;M;M;M;M;M;M", "homepage": "https://www.ias.informatik.tu-darmstadt.de/Team/NicoBohlinger;;;;https://ideas-ncbr.pl/en/osoby/krzysztof-walas/;https://www.jan-peters.net;https://www.ias.informatik.tu-darmstadt.de/Team/DavideTateo", "dblp": ";;;234/2595;05/9858.html;p/JanPeters1;214/0808", "google_scholar": "5SBR9tEAAAAJ;;;tilnVjMAAAAJ;0FZ0cZQAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;https://scholar.google.it/citations?user=LGnu3SEAAAAJ", "orcid": ";0009-0002-7199-2492;;;0000-0002-2800-2716;0000-0002-5266-8091;0000-0002-7193-923X", "linkedin": ";;maciej-krupka-91a41a212/;;krzysztof-walas-850492a7/;janrpeters/;", "or_profile": "~Nico_Bohlinger1;~Grzegorz_Czechmanowski1;~Maciej_Piotr_Krupka1;~Piotr_Kicki1;~Krzysztof_Walas2;~Jan_Peters3;~Davide_Tateo2", "aff": "Technische Universit\u00e4t Darmstadt;Technical University of Poznan;Technical University of Poznan;IDEAS NCBR Sp.;Technical University of Poznan;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tu-darmstadt.de;put.poznan.pl;put.poznan.pl;ideas-ncbr.pl;put.poznan.pl;tu-darmstadt.de;tu-darmstadt.de", "position": "PhD student;MS student;MS student;Postdoc;Assistant Professor;Full Professor;Researcher", "bibtex": "@inproceedings{\nbohlinger2024one,\ntitle={One Policy to Run Them All: an End-to-end Learning Approach to Multi-Embodiment Locomotion},\nauthor={Nico Bohlinger and Grzegorz Czechmanowski and Maciej Piotr Krupka and Piotr Kicki and Krzysztof Walas and Jan Peters and Davide 
Tateo},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=PbQOZntuXO}\n}", "github": "https://github.com/nico-bohlinger/one_policy_to_run_them_all", "project": "", "reviewers": "xWHF;bzcE;kaLC", "site": "https://openreview.net/forum?id=PbQOZntuXO", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3132988324242974998&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;2;1;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt;Technical University of Poznan;IDEAS NCBR", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tu-darmstadt.de;https://www.put.poznan.pl/;", "aff_unique_abbr": "TUD;PUT;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": "0;1;1;1;1;0;0", "aff_country_unique": "Germany;Poland" }, { "id": "Q2lGXMZCv8", "title": "LLARVA: Vision-Action Instruction Tuning Enhances Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "In recent years, instruction-tuned Large Multimodal Models (LMMs) have been successful at several tasks, including image captioning and visual question answering; yet leveraging these models remains an open question for robotics. Prior LMMs for robotics applications have been extensively trained on language and action data, but their ability to generalize in different settings has often been less than desired. To address this, we introduce LLARVA, a model trained with a novel instruction tuning method that leverages structured prompts to unify a range of robotic learning tasks, scenarios, and environments. Additionally, we show that predicting intermediate 2-D representations, which we refer to as *visual traces*, can help further align vision and action spaces for robot learning. We generate 8.5M image-visual trace pairs from the Open X-Embodiment dataset in order to pre-train our model, and we evaluate on 12 different tasks in the RLBench simulator as well as a physical Franka Emika Panda 7-DoF robot. 
Our experiments yield strong performance, demonstrating that LLARVA \u2014 using 2-D and language representations \u2014 performs well compared to several contemporary baselines, and can generalize across various robot environments and configurations.", "keywords": "LMMs;Vision Action Instruction Tuning;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/322e279e5d2f7e09c2ff86b580ee37d7cf352373.zip", "author": "Dantong Niu;Yuvan Sharma;Giscard Biamby;Jerome Quenum;Yutong Bai;Baifeng Shi;Trevor Darrell;Roei Herzig", "authorids": "~Dantong_Niu1;~Yuvan_Sharma1;~Giscard_Biamby1;~Jerome_Quenum1;~Yutong_Bai1;~Baifeng_Shi1;~Trevor_Darrell2;~Roei_Herzig2", "gender": "M;M;;F;;M;F;M", "homepage": "https://github.com/yuvansharma;;https://people.eecs.berkeley.edu/~jquenum/;https://yutongbai.com/;https://bfshi.github.io;https://roeiherz.github.io/;https://dantong88.github.io/;https://people.eecs.berkeley.edu/~trevor/", "dblp": ";251/1808;;216/8431;261/9376;215/5165;299/1393;d/TrevorDarrell", "google_scholar": ";https://scholar.google.com/citations?hl=en;mPP-u4IAAAAJ;N1-l4GsAAAAJ;LBEIm8gAAAAJ;https://scholar.google.co.il/citations?user=6Q-289IAAAAJ;AzlUrvUAAAAJ;https://scholar.google.com.tw/citations?user=bh-uRFMAAAAJ", "orcid": ";0000-0003-3500-8494;0000-0002-7126-5853;;;;0009-0006-7421-5858;", "linkedin": "yuvansharma/;giscard-biamby;;%E9%9B%A8%E6%A1%90-%E7%99%BD-59a44a136/;baifeng-shi-09171b188/;roei-herzig-7534615a/;;", "or_profile": "~Yuvan_Sharma1;~Giscard_Biamby1;~Jerome_Quenum1;~Yutong_Bai1;~Baifeng_Shi1;~Roei_Herzig2;~Niu_Dantong1;~trevor_darrell1", "aff": "University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;Johns Hopkins University;NVIDIA;University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department", "aff_domain": "berkeley.edu;berkeley.edu;berkeley.edu;jhu.edu;nvidia.com;berkeley.edu;berkeley.edu;eecs.berkeley.edu", "position": "Undergrad student;PhD student;PhD student;PhD student;Research Intern;Postdoc;PhD student;Professor", "bibtex": "@inproceedings{\nniu2024llarva,\ntitle={{LLARVA}: Vision-Action Instruction Tuning Enhances Robot Learning},\nauthor={Dantong Niu and Yuvan Sharma and Giscard Biamby and Jerome Quenum and Yutong Bai and Baifeng Shi and Trevor Darrell and Roei Herzig},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Q2lGXMZCv8}\n}", "github": "", "project": "", "reviewers": "WWDV;Uugi;s7WX", "site": "https://openreview.net/forum?id=Q2lGXMZCv8", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;5;5", "rating_avg": 3.0, "confidence_avg": 5.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14940008472659519766&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1;2;0;0;3", "aff_unique_norm": "University of California, Berkeley;Johns Hopkins University;NVIDIA;Electrical Engineering & Computer Science Department", "aff_unique_dep": ";;NVIDIA Corporation;Electrical Engineering & Computer Science", "aff_unique_url": "https://www.berkeley.edu;https://www.jhu.edu;https://www.nvidia.com;", "aff_unique_abbr": "UC Berkeley;JHU;NVIDIA;", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "QUzwHYJ9Hf", "title": "Towards Open-World Grasping with Large 
Vision-Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability to grasp objects in-the-wild from open-ended language instructions constitutes a fundamental challenge in robotics.\nAn open-world grasping system should be able to combine high-level contextual with low-level physical-geometric reasoning in order to be applicable in arbitrary scenarios.\nRecent works exploit the web-scale knowledge inherent in large language models (LLMs) to plan and reason in robotic context, but rely on external vision and action models to ground such knowledge into the environment and parameterize actuation.\nThis setup suffers from two major bottlenecks: a) the LLM's reasoning capacity is constrained by the quality of visual grounding, and b) LLMs do not contain low-level spatial understanding of the world, which is essential for grasping in contact-rich scenarios.\nIn this work we demonstrate that modern vision-language models (VLMs) are capable of tackling such limitations, as they are implicitly grounded and can jointly reason about semantics and geometry. \nWe propose \texttt{OWG}, an open-world grasping pipeline that combines VLMs with segmentation and grasp synthesis models to unlock grounded world understanding in three stages: open-ended referring segmentation, grounded grasp planning and grasp ranking via contact reasoning, all of which can be applied zero-shot via suitable visual prompting mechanisms.\nWe conduct extensive evaluation in cluttered indoor scene datasets to showcase \texttt{OWG}'s robustness in grounding from open-ended language, as well as open-world robotic grasping experiments in both simulation and hardware that demonstrate superior performance compared to previous supervised and zero-shot LLM-based methods.", "keywords": "Foundation Models for Robotics;Open-World Grasping;Open-Ended Visual Grounding;Robot Planning", "primary_area": "", "supplementary_material": "/attachment/010574e4a124d90f687efccc1a51a8cbf12f09be.zip", "author": "Georgios Tziafas;Hamidreza Kasaei", "authorids": "~Georgios_Tziafas1;~Hamidreza_Kasaei1", "gender": "M;M", "homepage": ";https://www.ai.rug.nl/hkasaei", "dblp": ";", "google_scholar": ";VFr_XuYAAAAJ", "orcid": ";", "linkedin": ";hamidreza-kasaei-49b83b57/", "or_profile": "~Georgios_Tziafas1;~Hamidreza_Kasaei1", "aff": "University of Groningen;University of Groningen", "aff_domain": "rug.nl;rug.nl", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\ntziafas2024towards,\ntitle={Towards Open-World Grasping with Large Vision-Language Models},\nauthor={Georgios Tziafas and Hamidreza Kasaei},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QUzwHYJ9Hf}\n}", "github": "https://github.com/gtziafas/OWG", "project": "", "reviewers": "hpr3;PGH6;HEJP;Adjp", "site": "https://openreview.net/forum?id=QUzwHYJ9Hf", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "4;4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3899325015338275802&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Groningen", "aff_unique_dep": "", "aff_unique_url": "https://www.rug.nl", "aff_unique_abbr": "RUG", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Netherlands" }, { "id": "Qoy12gkH4C", 
"title": "Progressive Multi-Modal Fusion for Robust 3D Object Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "Multi-sensor fusion is crucial for accurate 3D object detection in autonomous driving, with cameras and LiDAR being the most commonly used sensors. However, existing methods perform sensor fusion in a single view by projecting features from both modalities either in Bird's Eye View (BEV) or Perspective View (PV), thus sacrificing complementary information such as height or geometric proportions.\nTo address this limitation, we propose ProFusion3D, a progressive fusion framework that combines features in both BEV and PV at both intermediate and object query levels. Our architecture hierarchically fuses local and global features, enhancing the robustness of 3D object detection. Additionally, we introduce a self-supervised mask modeling pre-training strategy to improve multi-modal representation learning and data efficiency through three novel objectives. Extensive experiments on nuScenes and Argoverse2 datasets conclusively demonstrate the efficacy of ProFusion3D. Moreover, ProFusion3D is robust to sensor failure, showing strong performance when only one modality is available.", "keywords": "3D Object Detection;Multimodal Learning;Self-Supervised Learning", "primary_area": "", "supplementary_material": "/attachment/84d0f071fb9571bf71ec30ff71c769c084c95711.zip", "author": "Rohit Mohan;Daniele Cattaneo;Florian Drews;Abhinav Valada", "authorids": "~Rohit_Mohan1;~Daniele_Cattaneo1;florian.drews@de.bosch.com;~Abhinav_Valada1", "gender": "M;M;;M", "homepage": ";https://rl.uni-freiburg.de/people/cattaneo;;https://rl.uni-freiburg.de/people/valada", "dblp": ";;;81/9531", "google_scholar": "9emgsOwAAAAJ;https://scholar.google.it/citations?user=4Kif-mgAAAAJ;;https://scholar.google.de/citations?user=LcARjz0AAAAJ", "orcid": ";0000-0001-6662-5810;;0000-0003-4710-3114", "linkedin": ";;;avalada", "or_profile": "~Rohit_Mohan1;~Daniele_Cattaneo1;florian.drews@de.bosch.com;~Abhinav_Valada1", "aff": "Albert-Ludwigs-Universit\u00e4t Freiburg;Universit\u00e4t Freiburg;;University of Freiburg", "aff_domain": "uni-freiburg.de;uni-freiburg.de;;uni-freiburg.de", "position": "PhD student;Postdoc;;Full Professor", "bibtex": "@inproceedings{\nmohan2024progressive,\ntitle={Progressive Multi-Modal Fusion for Robust 3D Object Detection},\nauthor={Rohit Mohan and Daniele Cattaneo and Florian Drews and Abhinav Valada},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qoy12gkH4C}\n}", "github": "", "project": "", "reviewers": "nzZg;anjQ;9BjW", "site": "https://openreview.net/forum?id=Qoy12gkH4C", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4360174208060388115&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Albert-Ludwigs-Universit\u00e4t Freiburg;University of Freiburg", "aff_unique_dep": ";", "aff_unique_url": "https://www.uni-freiburg.de;https://www.uni-freiburg.de", "aff_unique_abbr": "Albert-Ludwigs-Universit\u00e4t;Uni Freiburg", "aff_campus_unique_index": "0", "aff_campus_unique": "Freiburg;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "Qpjo8l8AFW", "title": "Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation", "track": 
"main", "status": "Poster", "tldr": "", "abstract": "Given the high cost of collecting robotic data in the real world, sample efficiency is a consistently compelling pursuit in robotics. In this paper, we introduce SGRv2, an imitation learning framework that enhances sample efficiency through improved visual and action representations. Central to the design of SGRv2 is the incorporation of a critical inductive bias\u2014$\\textit{action locality}$, which posits that robot's actions are predominantly influenced by the target object and its interactions with the local environment. Extensive experiments in both simulated and real-world settings demonstrate that action locality is essential for boosting sample efficiency. SGRv2 excels in RLBench tasks with keyframe control using merely 5 demonstrations and surpasses the RVT baseline in 23 of 26 tasks. Furthermore, when evaluated on ManiSkill2 and MimicGen using dense control, SGRv2's success rate is 2.54 times that of SGR. In real-world environments, with only eight demonstrations, SGRv2 can perform a variety of tasks at a markedly higher success rate compared to baseline models.", "keywords": "Robotic Manipulation;Sample Efficiency", "primary_area": "", "supplementary_material": "/attachment/8a177dc3291f0480ac651b587bd91a02a8f4e771.zip", "author": "Tong Zhang;Yingdong Hu;Jiacheng You;Yang Gao", "authorids": "~Tong_Zhang23;~Yingdong_Hu1;~Jiacheng_You1;~Yang_Gao1", "gender": ";M;M;M", "homepage": "https://tongzhangthu.github.io/;;https://github.com/YouJiacheng;http://yang-gao.weebly.com", "dblp": ";219/8916;;89/4402-29", "google_scholar": "https://scholar.google.com/citations?hl=en;HhotyAoAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": ";;;", "linkedin": ";;;yang-gao-45245348/", "or_profile": "~Tong_Zhang23;~Yingdong_Hu1;~Jiacheng_You1;~Yang_Gao1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024leveraging,\ntitle={Leveraging Locality to Boost Sample Efficiency in Robotic Manipulation},\nauthor={Tong Zhang and Yingdong Hu and Jiacheng You and Yang Gao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qpjo8l8AFW}\n}", "github": "https://github.com/TongZhangTHU/sgr", "project": "", "reviewers": "LbA4;tgDw;AWsF", "site": "https://openreview.net/forum?id=Qpjo8l8AFW", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7508065889414236415&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China" }, { "id": "QtCtY8zl2T", "title": "Task Success Prediction for Open-Vocabulary Manipulation Based on Multi-Level Aligned Representations", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this study, we consider the problem of predicting task success for open-vocabulary manipulation by a manipulator, based on instruction sentences and egocentric images 
before and after manipulation. Conventional approaches, including multimodal large language models (MLLMs), often fail to appropriately understand detailed characteristics of objects and/or subtle changes in the position of objects. We propose Contrastive $\\lambda$-Repformer, which predicts task success for table-top manipulation tasks by aligning images with instruction sentences. Our method integrates the following three key types of features into a multi-level aligned representation: features that preserve local image information; features aligned with natural language; and features structured through natural language. This allows the model to focus on important changes by looking at the differences in the representation between two images. We evaluate Contrastive $\\lambda$-Repformer on a dataset based on a large-scale standard dataset, the RT-1 dataset, and on a physical robot platform. The results show that our approach outperformed existing approaches including MLLMs. Our best model achieved an improvement of 8.66 points in accuracy compared to the representative MLLM-based model.", "keywords": "Task Success Prediction;Open-Vocabulary Manipulation;Multi-Level Aligned Visual Representation", "primary_area": "", "supplementary_material": "/attachment/fcdccef46710511cf035198d233ad70b123e6d01.zip", "author": "Miyu Goko;Motonari Kambara;Daichi Saito;Seitaro Otsuki;Komei Sugiura", "authorids": "~Miyu_Goko1;~Motonari_Kambara1;~Daichi_Saito1;~Seitaro_Otsuki1;~Komei_Sugiura1", "gender": "F;M;M;;M", "homepage": ";https://motonarikambara.github.io/;https://smilab.org/en/;;https://komeisugiura.jp/index_en.html", "dblp": ";296/4008;;321/6903;77/2654", "google_scholar": ";https://scholar.google.co.jp/citations?user=ptQaisMAAAAJ;;koZVTJ4AAAAJ;1Kd0W0oAAAAJ", "orcid": "0009-0009-9233-7121;;;0009-0009-8071-6060;0000-0002-0261-0510", "linkedin": ";;;;", "or_profile": "~Miyu_Goko1;~Motonari_Kambara1;~Daichi_Saito1;~Seitaro_Otsuki1;~Komei_Sugiura1", "aff": "Keio University;Keio University;Keio University;Keio University;Keio University", "aff_domain": "keio.ac.jp;keio.ac.jp;keio.jp;keio.jp;keio.jp", "position": "Undergrad student;PhD student;Undergrad student;MS student;Full Professor", "bibtex": "@inproceedings{\ngoko2024task,\ntitle={Task Success Prediction for Open-Vocabulary Manipulation Based on Multi-Level Aligned Representations},\nauthor={Miyu Goko and Motonari Kambara and Daichi Saito and Seitaro Otsuki and Komei Sugiura},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=QtCtY8zl2T}\n}", "github": "https://github.com/keio-smilab24/contrastive-lambda-repformer", "project": "", "reviewers": "ELhE;cPTW;yUdH", "site": "https://openreview.net/forum?id=QtCtY8zl2T", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12423598358155388884&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Keio University", "aff_unique_dep": "", "aff_unique_url": "https://www.keio.ac.jp", "aff_unique_abbr": "Keio", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Japan" }, { "id": "Qz2N4lWBk3", "title": "Learning Granular Media Avalanche Behavior for Indirectly Manipulating Obstacles on a Granular 
Slope", "track": "main", "status": "Poster", "tldr": "", "abstract": "Legged robot locomotion on sand slopes is challenging due to the complex dynamics of granular media and how the lack of solid surfaces can hinder locomotion. A promising strategy, inspired by ghost crabs and other organisms in nature, is to strategically interact with rocks, debris, and other obstacles to facilitate movement. To provide legged robots with this ability, we present a novel approach that leverages avalanche dynamics to indirectly manipulate objects on a granular slope. We use a Vision Transformer (ViT) to process image representations of granular dynamics and robot excavation actions. The ViT predicts object movement, which we use to determine which leg excavation action to execute. We collect training data from 100 real physical trials and, at test time, deploy our trained model in novel settings. Experimental results suggest that our model can accurately predict object movements and achieve a success rate \u2265 80% in a variety of manipulation tasks with up to four obstacles, and can also generalize to objects with different physics properties. To our knowledge, this is the first paper to leverage granular media avalanche dynamics to indirectly manipulate objects on granular slopes. Supplementary material is available at https://sites.google.com/view/grain-corl2024/home.", "keywords": "Granular media;Avalanche dynamics;Legged robots.", "primary_area": "", "supplementary_material": "/attachment/afa298a89ba38944c3dae3eff43a86ef897a3fd9.zip", "author": "Haodi Hu;Feifei Qian;Daniel Seita", "authorids": "~Haodi_Hu1;~Feifei_Qian1;~Daniel_Seita1", "gender": "M;F;M", "homepage": "https://sites.google.com/view/haodihu;https://viterbi.usc.edu/directory/faculty/Qian/Feifei;https://danielseita.github.io/", "dblp": "227/7639;;172/0917", "google_scholar": "Ez_AALsAAAAJ;SqYmRh0AAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;", "linkedin": ";;", "or_profile": "~Haodi_Hu1;~Feifei_Qian1;~Daniel_Takeshi_Seita1", "aff": "University of Southern California;University of Southern California;University of Southern California", "aff_domain": "usc.edu;usc.edu;usc.edu", "position": "PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhu2024learning,\ntitle={Learning Granular Media Avalanche Behavior for Indirectly Manipulating Obstacles on a Granular Slope},\nauthor={Haodi Hu and Feifei Qian and Daniel Seita},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Qz2N4lWBk3}\n}", "github": "", "project": "", "reviewers": "fr2L;egEL;M9RF", "site": "https://openreview.net/forum?id=Qz2N4lWBk3", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10114970272898940413&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Southern California", "aff_unique_dep": "", "aff_unique_url": "https://www.usc.edu", "aff_unique_abbr": "USC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Los Angeles", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "RMkdcKK7jq", "title": "SLR: Learning Quadruped Locomotion without Privileged Information", "track": "main", "status": "Poster", "tldr": "", "abstract": "Traditional reinforcement learning control for 
quadruped robots often relies on privileged information, demanding meticulous selection and precise estimation, thereby imposing constraints on the development process. This work proposes a Self-learning Latent Representation (SLR) method, which achieves high-performance control policy learning without the need for privileged information. To enhance the credibility of our proposed method's evaluation, SLR is compared with open-source code repositories of state-of-the-art algorithms, retaining the original authors' configuration parameters. Across four repositories, SLR consistently outperforms the reference results. Ultimately, the trained policy and encoder empower the quadruped robot to navigate steps, climb stairs, ascend rocks, and traverse various challenging terrains.", "keywords": "Locomotion;Reinforcement Learning;Privileged Learning", "primary_area": "", "supplementary_material": "/attachment/ca059a9b5a2d84d90d08d61406454e2d3fcc47f3.zip", "author": "Shiyi Chen;Zeyu Wan;Shiyang Yan;Chun Zhang;Weiyi Zhang;Qiang Li;Debing Zhang;Fasih Ud Din Farrukh", "authorids": "~Shiyi_Chen3;~Zeyu_Wan2;~Shiyang_Yan3;~Chun_Zhang4;~Weiyi_Zhang3;~Qiang_Li28;~Debing_Zhang4;~Fasih_Ud_Din_Farrukh1", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://github.com/11chens;;https://openai.com/;;https://sites.google.com/site/qiangliresearch/qiang-li-s-homepage;;https://www.sic.tsinghua.edu.cn/en/info/1086/1470.htm;", "dblp": ";;;;;;;", "google_scholar": ";;;https://scholar.google.com.hk/citations?user=pQiJ8GAAAAAJ;;SDL-2OkAAAAJ;;https://scholar.google.com.hk/citations?user=VwY4KfAAAAAJ", "orcid": ";0009-0003-7778-8473;;;;;;0009-0005-5598-1891", "linkedin": ";;;;;;;", "or_profile": "~Shiyi_Chen3;~Zeyu_Wan2;~Shiyang_Yan3;~Weiyi_Zhang3;~Qiang_Li28;~Fasih_Ud_Din_Farrukh1;~chun_zhang3;~Zhang_Debing1", "aff": "Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;Shenzhen Technology University;Tsinghua University ;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;mail.tsinghua.edu.cn;mail.tsinghua.edu.cn;mails.tsinghua.edu.cn;sztu.edu.cn;tsinghua.org.cn;cs.tsinghua.edu.cn;cs.tsinghua.edu.cn", "position": "MS student;MS student;MS student;PhD student;Full Professor;Postdoc;Full Professor;Instructor", "bibtex": "@inproceedings{\nchen2024slr,\ntitle={{SLR}: Learning Quadruped Locomotion without Privileged Information},\nauthor={Shiyi Chen and Zeyu Wan and Shiyang Yan and Chun Zhang and Weiyi Zhang and Qiang Li and Debing Zhang and Fasih Ud Din Farrukh},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=RMkdcKK7jq}\n}", "github": "", "project": "", "reviewers": "b8HJ;Bcyx;fW7r;ztJ4", "site": "https://openreview.net/forum?id=RMkdcKK7jq", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "3;4;5;5", "rating_avg": 2.75, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 8, "corr_rating_confidence": 0.8703882797784891, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7955879186406775349&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;0;1;0;0;0", "aff_unique_norm": "Tsinghua University;Shenzhen Technology University", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.sztu.edu.cn", "aff_unique_abbr": "THU;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "S2Jwb0i7HN", "title": "DextrAH-G: Pixels-to-Action Dexterous Arm-Hand 
Grasping with Geometric Fabrics", "track": "main", "status": "Poster", "tldr": "", "abstract": "A pivotal challenge in robotics is achieving fast, safe, and robust dexterous grasping across a diverse range of objects, an important goal within industrial applications. However, existing methods often have very limited speed, dexterity, and generality, along with limited or no hardware safety guarantees. In this work, we introduce DextrAH-G, a depth-based dexterous grasping policy trained entirely in simulation that combines reinforcement learning, geometric fabrics, and teacher-student distillation. We address key challenges in joint arm-hand policy learning, such as high-dimensional observation and action spaces, the sim2real gap, collision avoidance, and hardware constraints. DextrAH-G enables a 23 motor arm-hand robot to safely and continuously grasp and transport a large variety of objects at high speed using multi-modal inputs including depth images, allowing generalization across object geometry. Videos at https://sites.google.com/view/dextrah-g.", "keywords": "Dexterous Grasping;Geometric Fabrics;Reinforcement Learning;Teacher-Student Distillation;Sim-to-Real Transfer", "primary_area": "", "supplementary_material": "/attachment/2c398d9df21839ecaf6f24975854678d392ac1de.zip", "author": "Tyler Ga Wei Lum;Martin Matak;Viktor Makoviychuk;Ankur Handa;Arthur Allshire;Tucker Hermans;Nathan D. Ratliff;Karl Van Wyk", "authorids": "~Tyler_Ga_Wei_Lum1;~Martin_Matak1;~Viktor_Makoviychuk1;~Ankur_Handa1;~Arthur_Allshire1;~Tucker_Hermans2;~Nathan_D._Ratliff1;~Karl_Van_Wyk1", "gender": "M;M;M;M;;M;;", "homepage": "https://tylerlum.github.io/;https://martinmatak.github.io/;;http://ankurhanda.com;https://allshire.org;https://robot-learning.cs.utah.edu;;", "dblp": ";249/9331;;32/8653;;https://dblp.uni-trier.de/pid/67/4241;43/2704;", "google_scholar": "kPq6-XIAAAAJ;45-QOcYAAAAJ;rmAcDNkAAAAJ;sCTJI-0AAAAJ;https://scholar.google.ca/citations?user=TqsW7qMAAAAJ;G5_VFfkAAAAJ;https://scholar.google.com/citations?hl=en;TCYAoF8AAAAJ", "orcid": ";;;;;0000-0003-2496-2768;;", "linkedin": "tyler-lum/;martin-matak-38420348/;;;;;nathan-ratliff-b347018b/;", "or_profile": "~Tyler_Ga_Wei_Lum1;~Martin_Matak1;~Viktor_Makoviychuk1;~Ankur_Handa1;~Arthur_Allshire1;~Tucker_Hermans2;~Nathan_D._Ratliff1;~Karl_Van_Wyk1", "aff": "Stanford University;University of Utah;NVIDIA;Imperial College London;University of Toronto;University of Utah;NVIDIA;", "aff_domain": "stanford.edu;utah.edu;nvidia.com;imperial.ac.uk;utoronto.ca;utah.edu;nvidia.com;", "position": "PhD student;PhD student;Senior Research Scientist;Research Scientist;Undergrad student;Associate Professor;Researcher;", "bibtex": "@inproceedings{\nlum2024dextrahg,\ntitle={Dextr{AH}-G: Pixels-to-Action Dexterous Arm-Hand Grasping with Geometric Fabrics},\nauthor={Tyler Ga Wei Lum and Martin Matak and Viktor Makoviychuk and Ankur Handa and Arthur Allshire and Tucker Hermans and Nathan D. 
Ratliff and Karl Van Wyk},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S2Jwb0i7HN}\n}", "github": "", "project": "", "reviewers": "7Akv;1JfV;LfmX", "site": "https://openreview.net/forum?id=S2Jwb0i7HN", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;2;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9907772504003620384&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;4;1;2", "aff_unique_norm": "Stanford University;University of Utah;NVIDIA;Imperial College London;University of Toronto", "aff_unique_dep": ";;NVIDIA Corporation;;", "aff_unique_url": "https://www.stanford.edu;https://www.utah.edu;https://www.nvidia.com;https://www.imperial.ac.uk;https://www.utoronto.ca", "aff_unique_abbr": "Stanford;Utah;NVIDIA;ICL;U of T", "aff_campus_unique_index": "0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;1;2;0;0", "aff_country_unique": "United States;United Kingdom;Canada" }, { "id": "S70MgnIA0v", "title": "Robotic Control via Embodied Chain-of-Thought Reasoning", "track": "main", "status": "Poster", "tldr": "", "abstract": "A key limitation of learned robot control policies is their inability to generalize outside their training data. \nRecent works on vision-language-action models (VLAs) have shown that the use of large, internet pre-trained vision-language models as the backbone of learned robot policies can substantially improve their robustness and generalization ability. Yet, one of the most exciting capabilities of large vision-language models in other domains is their ability to reason iteratively through complex problems. Can that same capability be brought into robotics to allow policies to improve performance by reasoning about a given task before acting? Naive use of \"chain-of-thought\" (CoT) style prompting is significantly less effective with standard VLAs because of the relatively simple training examples that are available to them. Additionally, purely semantic reasoning about sub-tasks, as is common in regular CoT, is insufficient for robot policies that need to ground their reasoning in sensory observations and the robot state. To this end, we introduce Embodied Chain-of-Thought Reasoning (ECoT) for VLAs, in which we train VLAs to perform multiple steps of reasoning about plans, sub-tasks, motions, and visually grounded features like object bounding boxes and end effector positions, before predicting the robot action. We design a scalable pipeline for generating synthetic training data for ECoT on large robot datasets. We demonstrate that ECoT increases the absolute success rate of OpenVLA, the current strongest open-source VLA policy, by 28\\% across challenging generalization tasks, without any additional robot training data. 
Additionally, ECoT makes it easier for humans to interpret a policy's failures and correct its behavior using natural language.", "keywords": "Vision-Language-Action Models;Embodied Chain-of-Thought Reasoning", "primary_area": "", "supplementary_material": "/attachment/c69c6e08f4c73715e7e300a83f8384f5e41e6aa7.zip", "author": "Micha\u0142 Zawalski;William Chen;Karl Pertsch;Oier Mees;Chelsea Finn;Sergey Levine", "authorids": "~Micha\u0142_Zawalski1;~William_Chen1;~Karl_Pertsch1;~Oier_Mees1;~Chelsea_Finn1;~Sergey_Levine1", "gender": "M;M;;M;F;M", "homepage": "https://michalzawalski.github.io/;;https://kpertsch.github.io/;https://www.oiermees.com/;https://ai.stanford.edu/~cbfinn/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "300/4651.html;;211/7137;190/8659;131/1783;80/7594", "google_scholar": "ljbCuVkAAAAJ;xUeq5EAAAAAJ;https://scholar.google.com/citations?view_op=list_works;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;vfPE6hgAAAAJ;8R35rCwAAAAJ", "orcid": "0000-0002-4063-2411;;;;;", "linkedin": "micha\u0142-zawalski;william-chen-a3956516b/;;oier-mees-a3069488;;", "or_profile": "~Micha\u0142_Zawalski1;~William_Chen1;~Karl_Pertsch1;~Oier_Mees1;~Chelsea_Finn1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Stanford University;Electrical Engineering & Computer Science Department, University of California, Berkeley;Google;Google", "aff_domain": "berkeley.edu;berkeley.edu;stanford.edu;eecs.berkeley.edu;google.com;google.com", "position": "Intern;PhD student;Postdoc;Postdoc;Research Scientist;Research Scientist", "bibtex": "@inproceedings{\nzawalski2024robotic,\ntitle={Robotic Control via Embodied Chain-of-Thought Reasoning},\nauthor={Micha{\\l} Zawalski and William Chen and Karl Pertsch and Oier Mees and Chelsea Finn and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S70MgnIA0v}\n}", "github": "https://github.com/MichalZawalski/embodied-CoT/", "project": "", "reviewers": "Nt3h;LC5L;DLrg", "site": "https://openreview.net/forum?id=S70MgnIA0v", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 58, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5854634104628931828&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;1;0;2;2", "aff_unique_norm": "University of California, Berkeley;Stanford University;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Stanford;Google", "aff_campus_unique_index": "0;0;1;0;2;2", "aff_campus_unique": "Berkeley;Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "S8jQtafbT3", "title": "Autonomous Interactive Correction MLLM for Robust Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability to reflect on and correct failures is crucial for robotic systems to interact stably with real-life objects. Observing the generalization and reasoning capabilities of Multimodal Large Language Models (MLLMs), previous approaches have aimed to utilize these models to enhance robotic systems accordingly. 
However, these methods typically focus on high-level planning corrections using an additional MLLM, with limited utilization of failed samples to correct low-level contact poses which is particularly prone to occur during articulated object manipulation. To address this gap, we propose an Autonomous Interactive Correction (AIC) MLLM, which makes use of previous low-level interaction experiences to correct SE(3) pose predictions for articulated object. Specifically, AIC MLLM is initially fine-tuned to acquire both pose prediction and feedback prompt comprehension abilities. We design two types of prompt instructions for interactions with objects: 1) visual masks to highlight unmovable parts for position correction, and 2) textual descriptions to indicate potential directions for rotation correction. During inference, a Feedback Information Extraction module is introduced to recognize the failure cause, allowing AIC MLLM to adaptively correct the pose prediction using the corresponding prompts. To further enhance manipulation stability, we devise a Test Time Adaptation strategy that enables AIC MLLM to better adapt to the current scene configuration. Finally, extensive experiments are conducted in both simulated and real-world environments to evaluate the proposed method. The results demonstrate that our AIC MLLM can efficiently correct failure samples by leveraging interaction experience prompts.", "keywords": "large language model;robotics", "primary_area": "", "supplementary_material": "/attachment/407e6cb3c4c9eaf3fd5238be9fa3328318186786.zip", "author": "Chuyan Xiong;Chengyu Shen;Xiaoqi Li;Kaichen Zhou;Jiaming Liu;Ruiping Wang;Hao Dong", "authorids": "~Chuyan_Xiong1;~Chengyu_Shen1;~Xiaoqi_Li3;~Kaichen_Zhou1;~Jiaming_Liu2;~Ruiping_Wang1;~Hao_Dong3", "gender": "F;;M;M;M;M;F", "homepage": "https://lxsy-xcy.github.io/;;http://zalex97.github.io/;https://github.com/liujiaming1996;https://rpwang.net/;https://zsdonghao.github.io;https://clorislili.github.io/clorisLi/", "dblp": ";;;;60/1529-1;14/1525-3.html;357/1937", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;https://scholar.google.com/citations?hl=zh-CN;;cPki5sUAAAAJ;duIUwpwAAAAJ;xLFL4sMAAAAJ;vkQ5_LIAAAAJ", "orcid": ";;;0000-0002-6770-4390;0000-0003-1830-2595;0000-0003-2261-9122;", "linkedin": ";;;;;;xiaoqi-li/", "or_profile": "~Chuyan_Xiong1;~Chengyu_Shen1;~Kaichen_Zhou1;~Jiaming_Liu2;~Ruiping_Wang1;~Hao_Dong3;~Xiaoqi_Cloris_Li1", "aff": "Beijing Jiaotong University;Xi'an Jiaotong University;Department of Computer Science, University of Oxford;Peking University;Institute of Computing Technology, Chinese Academy of Sciences;Peking University;Peking University", "aff_domain": "bjtu.edu.cn;xjtu.edu.cn;cs.ox.ac.uk;pku.edu.cn;ict.ac.cn;pku.edu.cn;pku.edu.cn", "position": "Undergrad student;Undergrad student;PhD student;PhD student;Full Professor;Assistant Professor;PhD student", "bibtex": "@inproceedings{\nxiong2024autonomous,\ntitle={Autonomous Interactive Correction {MLLM} for Robust Robotic Manipulation},\nauthor={Chuyan Xiong and Chengyu Shen and Xiaoqi Li and Kaichen Zhou and Jiaming Liu and Ruiping Wang and Hao Dong},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=S8jQtafbT3}\n}", "github": "", "project": "", "reviewers": "q3ru;ygE4;azV8", "site": "https://openreview.net/forum?id=S8jQtafbT3", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;3;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 7, 
"corr_rating_confidence": 0.49999999999999983, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2249143201382252535&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3;4;3;3", "aff_unique_norm": "Beijing Jiao Tong University;Xi'an Jiao Tong University;University of Oxford;Peking University;Chinese Academy of Sciences", "aff_unique_dep": ";;Department of Computer Science;;Institute of Computing Technology", "aff_unique_url": "http://www.njtu.edu.cn/en;https://www.xjtu.edu.cn;https://www.ox.ac.uk;http://www.pku.edu.cn;http://www.ict.ac.cn", "aff_unique_abbr": "BJTU;XJTU;Oxford;Peking U;CAS", "aff_campus_unique_index": "1", "aff_campus_unique": ";Oxford", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "China;United Kingdom" }, { "id": "SFJz5iLvur", "title": "Lessons from Learning to Spin \u201cPens\u201d", "track": "main", "status": "Poster", "tldr": "", "abstract": "In-hand manipulation of pen-like objects is a most basic and important skill in our daily lives, as many tools such as hammers and screwdrivers are similarly shaped. However, current learning-based methods struggle with this task due to a lack of high-quality demonstrations and the significant gap between simulation and the real world. In this work, we push the boundaries of learning-based in-hand manipulation systems by demonstrating the capability to spin pen-like objects. We use reinforcement learning to train a policy and generate a high-fidelity trajectory dataset in simulation. This serves two purposes: 1) pre-training a sensorimotor policy in simulation; 2) conducting open-loop trajectory replay in the real world. We then fine-tune the sensorimotor policy using these real-world trajectories to adapt to the real world. With less than 50 trajectories, our policy learns to rotate more than ten pen-like objects with different physical properties for multiple revolutions. We present a comprehensive analysis of our design choices and share the lessons learned during development. 
Videos are shown on https://corl-2024-dexpen.github.io/.", "keywords": "Dexterous In-Hand Manipulation;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/ca3e4be843c98a124b7ae50e4d8a112dae45f781.zip", "author": "Jun Wang;Ying Yuan;Haichuan Che;Haozhi Qi;Yi Ma;Jitendra Malik;Xiaolong Wang", "authorids": "~Jun_Wang60;~Ying_Yuan2;~Haichuan_Che1;~Haozhi_Qi1;~Yi_Ma4;~Jitendra_Malik2;~Xiaolong_Wang3", "gender": ";F;M;M;M;M;M", "homepage": ";https://yingyuan0414.github.io;;https://haozhi.io/;http://people.eecs.berkeley.edu/~yima/;https://people.eecs.berkeley.edu/~malik/;https://xiaolonw.github.io/", "dblp": ";;;190/7802;;58/2944;91/952-4", "google_scholar": ";2IEoTWwAAAAJ;;https://scholar.google.com.hk/citations?user=iyVHKkcAAAAJ;https://scholar.google.com.hk/citations?user=XqLiBQMAAAAJ;oY9R5YQAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;;0000-0003-3695-1580;", "linkedin": ";;haichuan-che-7338721b1/;;;;", "or_profile": "~Jun_Wang60;~Ying_Yuan2;~Haichuan_Che1;~Haozhi_Qi1;~Yi_Ma4;~Jitendra_Malik2;~Xiaolong_Wang3", "aff": ";IIIS, Tsinghua University, Tsinghua University;University of California, San Diego;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley;University of California, San Diego", "aff_domain": ";mails.tsinghua.edu.cn;ucsd.edu;berkeley.edu;berkeley.edu;berkeley.edu;ucsd.edu", "position": ";Undergrad student;MS student;PhD student;Full Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024lessons,\ntitle={Lessons from Learning to Spin {\\textquotedblleft}Pens{\\textquotedblright}},\nauthor={Jun Wang and Ying Yuan and Haichuan Che and Haozhi Qi and Yi Ma and Jitendra Malik and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SFJz5iLvur}\n}", "github": "https://github.com/HaozhiQi/penspin", "project": "", "reviewers": "defC;rqxA;qCCU;U3nL", "site": "https://openreview.net/forum?id=SFJz5iLvur", "pdf_size": 0, "rating": "3;3;3;4", "confidence": "4;4;4;5", "rating_avg": 3.25, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 1.0, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2859577659021056918&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2;2;1", "aff_unique_norm": "Tsinghua University;University of California, San Diego;University of California, Berkeley", "aff_unique_dep": "Institute for Interdisciplinary Information Sciences;;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.ucsd.edu;https://www.berkeley.edu", "aff_unique_abbr": "THU;UCSD;UC Berkeley", "aff_campus_unique_index": "0;1;2;2;2;1", "aff_campus_unique": "Beijing;San Diego;Berkeley", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "SW8ntpJl0E", "title": "JA-TN: Pick-and-Place Towel Shaping from Crumpled States based on TransporterNet with Joint-Probability Action Inference", "track": "main", "status": "Poster", "tldr": "", "abstract": "Towel manipulation is a crucial step towards more general cloth manipulation. However, folding a towel from an arbitrarily crumpled state and recovering from a failed folding step remain critical challenges in robotics. We propose joint-probability action inference JA-TN, as a way to improve TransporterNet's operational efficiency; to our knowledge, this is the first single data-driven policy to achieve various types of folding from most crumpled states. 
We present three benchmark domains with a set of shaping tasks and the corresponding oracle policies to facilitate the further development of the field. We also present a simulation-to-reality transfer procedure for vision-based deep learning controllers by processing and augmenting RGB and/or depth images. We also demonstrate JA-TN's ability to integrate with a real camera and a UR3e robot arm, showcasing the method's applicability to real-world tasks.", "keywords": "Cloth Manipulation;Imitation Learning;Sim2Real Transfer", "primary_area": "", "supplementary_material": "/attachment/ea32db4181760db3977befc3d2ab1d370e7547c2.zip", "author": "Halid Abdulrahim Kadi;Kasim Terzi\u0107", "authorids": "~Halid_Abdulrahim_Kadi1;~Kasim_Terzi\u01071", "gender": "M;Not Specified", "homepage": ";https://www.st-andrews.ac.uk/computer-science/people/kt54/", "dblp": ";86/5749.html", "google_scholar": ";https://scholar.google.co.uk/citations?user=kSIPwxMAAAAJ", "orcid": "0000-0001-9290-467X;0000-0001-6692-209X", "linkedin": ";", "or_profile": "~Halid_Abdulrahim_Kadi1;~Kasim_Terzi\u01071", "aff": "University of St. Andrews;University of St. Andrews", "aff_domain": "st-andrews.ac.uk;st-andrews.ac.uk", "position": "PhD student;Lecturer", "bibtex": "@inproceedings{\nkadi2024jatn,\ntitle={{JA}-{TN}: Pick-and-Place Towel Shaping from Crumpled States based on TransporterNet with Joint-Probability Action Inference},\nauthor={Halid Abdulrahim Kadi and Kasim Terzi{\\'c}},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SW8ntpJl0E}\n}", "github": "", "project": "", "reviewers": "4Ctm;WykJ;zcen", "site": "https://openreview.net/forum?id=SW8ntpJl0E", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14331194456437598579&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0", "aff_unique_norm": "University of St. Andrews", "aff_unique_dep": "", "aff_unique_url": "https://www.st-andrews.ac.uk", "aff_unique_abbr": "St. Andrews", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "SfaB20rjVo", "title": "An Open-Source Soft Robotic Platform for Autonomous Aerial Manipulation in the Wild", "track": "main", "status": "Poster", "tldr": "", "abstract": "Aerial manipulation combines the versatility and speed of flying platforms with the functional capabilities of mobile manipulation, which presents significant challenges due to the need for precise localization and control. Traditionally, researchers have relied on off-board perception systems, which are limited to expensive and impractical specially equipped indoor environments. In this work, we introduce a novel platform for autonomous aerial manipulation that exclusively utilizes onboard perception systems. Our platform can perform aerial manipulation in various indoor and outdoor environments without depending on external perception systems. Our experimental results demonstrate the platform's ability to autonomously grasp various objects in diverse settings. This advancement significantly improves the scalability and practicality of aerial manipulation applications by eliminating the need for costly tracking solutions. 
To accelerate future research, we open source our modern ROS 2 software stack and custom hardware design, making our contributions accessible to the broader research community.", "keywords": "Aerial Manipulation;Learning-Based Grasping;Autonomous Flight;Robotic Systems;Soft Grasping", "primary_area": "", "supplementary_material": "/attachment/d0c17be3befaf55bc02e6926bb8402218be065bd.zip", "author": "Erik Bauer;Marc Bl\u00f6chlinger;Pascal Strauch;Arman Raayatsanati;Cavelti Curdin;Robert K. Katzschmann", "authorids": "~Erik_Bauer1;~Marc_Bl\u00f6chlinger1;~Pascal_Strauch1;~Arman_Raayatsanati1;~Cavelti_Curdin1;~Robert_K._Katzschmann1", "gender": "M;M;M;;M;Not Specified", "homepage": ";;;https://www.raayatsanati.com/;;http://srl.ethz.ch", "dblp": "315/4075;315/5114;315/4376;315/4023;;139/3491", "google_scholar": "_GE0_goAAAAJ;;;TPHvcbQAAAAJ;;https://scholar.google.ch/citations?hl=en", "orcid": ";0009-0003-0891-6208;;0000-0002-8002-8649;0009-0007-3497-1971;0000-0001-7143-7259", "linkedin": "https://linkedin.com/in/erik-b-401baa18b;;;https://www.linkedin.com/asanati;;robertkatzschmann/", "or_profile": "~Erik_Bauer1;~Marc_Bl\u00f6chlinger1;~Pascal_Strauch1;~Arman_Raayatsanati1;~Cavelti_Curdin1;~Robert_Kevin_Katzschmann1", "aff": "BMW Group;ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETH Zurich;ETHZ - ETH Zurich;Swiss Federal Institute of Technology", "aff_domain": "bmwgroup.com;ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "Intern;MS student;MS student;MS student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nbauer2024an,\ntitle={An Open-Source Soft Robotic Platform for Autonomous Aerial Manipulation in the Wild},\nauthor={Erik Bauer and Marc Bl{\\\"o}chlinger and Pascal Strauch and Arman Raayatsanati and Cavelti Curdin and Robert K. Katzschmann},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=SfaB20rjVo}\n}", "github": "https://github.com/srl-ethz/osprey", "project": "", "reviewers": "BFE8;KZo7;ouWQ", "site": "https://openreview.net/forum?id=SfaB20rjVo", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=632080854122016773&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;1;1;1;1;2", "aff_unique_norm": "BMW Group;ETH Zurich;Swiss Federal Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.bmwgroup.com;https://www.ethz.ch;https://www.ethz.ch", "aff_unique_abbr": "BMW;ETHZ;ETH Zurich", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "Germany;Switzerland" }, { "id": "Si2krRESZb", "title": "TieBot: Learning to Knot a Tie from Visual Demonstration through a Real-to-Sim-to-Real Approach", "track": "main", "status": "Poster", "tldr": "", "abstract": "The tie-knotting task is highly challenging due to the tie's high deformation and long-horizon manipulation actions. This work presents TieBot, a Real-to-Sim-to-Real learning from visual demonstration system for the robots to learn to knot a tie. We introduce the Hierarchical Feature Matching approach to estimate a sequence of tie's meshes from the demonstration video. With these estimated meshes used as subgoals, we first learn a teacher policy using privileged information. 
Then, we learn a student policy with point cloud observation by imitating the teacher policy. Lastly, our pipeline applies the learned policy to real-world execution. We demonstrate the effectiveness of TieBot in simulation and the real world. In the real-world experiment, a dual-arm robot successfully knots a tie, achieving a 50% success rate across 10 trials. Videos can be found on https://tiebots.github.io/.", "keywords": "cloth manipulation;learning from demonstration;robot learning", "primary_area": "", "supplementary_material": "/attachment/0d528373b86b54d8269529cac4e9d40497df375d.zip", "author": "Weikun Peng;Jun Lv;Yuwei Zeng;Haonan Chen;Siheng Zhao;Jichen Sun;Cewu Lu;Lin Shao", "authorids": "~Weikun_Peng1;~Jun_Lv2;~Yuwei_Zeng1;~Haonan_Chen4;~Siheng_Zhao1;~Jichen_Sun1;~Cewu_Lu3;~Lin_Shao2", "gender": ";M;;M;;M;M;M", "homepage": ";https://lyuj1998.github.io/;https://friolero.github.io/;https://github.com/chenhn02;https://sihengz02.github.io/;https://madcreeper.github.io/;https://www.mvig.org/;https://linsats.github.io/", "dblp": "317/1295;;;;341/1176;;;26/8546-2", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;DtaiAjwAAAAJ;PqvAzW4AAAAJ;;l7EAauYAAAAJ;;https://scholar.google.com.tw/citations?user=QZVQEWAAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0002-6720-2734;;;;;;;", "linkedin": "weikun-peng-7731281b4/;;;;;;;", "or_profile": "~Weikun_Peng1;~Jun_Lv2;~Yuwei_Zeng1;~Haonan_Chen4;~Siheng_Zhao1;~Jichen_Sun1;~Cewu_Lu3;~Lin_Shao2", "aff": "national university of singapore, National University of Singapore;Shanghai Jiaotong University;National University of Singapore;Nanjing University;Nanjing University;Shanghai Jiaotong University;Shanghai Jiaotong University;National University of Singapore", "aff_domain": "u.nus.edu;sjtu.edu.cn;comp.nus.edu.sg;nju.edu.cn;nju.edu.cn;sjtu.edu.cn;sjtu.edu.cn;nus.edu.sg", "position": "MS student;PhD student;PhD student;Undergrad student;Undergrad student;Undergrad student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\npeng2024tiebot,\ntitle={TieBot: Learning to Knot a Tie from Visual Demonstration through a Real-to-Sim-to-Real Approach},\nauthor={Weikun Peng and Jun Lv and Yuwei Zeng and Haonan Chen and Siheng Zhao and Jichen Sun and Cewu Lu and Lin Shao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Si2krRESZb}\n}", "github": "", "project": "", "reviewers": "XN7q;NNyg;eicD", "site": "https://openreview.net/forum?id=Si2krRESZb", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15811556620959477965&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;2;1;1;0", "aff_unique_norm": "National University of Singapore;Shanghai Jiao Tong University;Nanjing University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.sjtu.edu.cn;https://www.nju.edu.cn", "aff_unique_abbr": "NUS;SJTU;Nanjing U", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1;1;1;1;0", "aff_country_unique": "Singapore;China" }, { "id": "TzqKmIhcwq", "title": "Structured Bayesian Meta-Learning for Data-Efficient Visual-Tactile Model Estimation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Estimating visual-tactile models of deformable 
objects is challenging because vision suffers from occlusion, while touch data is sparse and noisy. We propose a novel data-efficient method for dense heterogeneous model estimation by leveraging experience from diverse training objects. The method is based on Bayesian Meta-Learning (BML), which can mitigate overfitting high-capacity visual-tactile models by meta-learning an informed prior and naturally achieves few-shot online estimation via posterior estimation. However, BML requires a shared parametric model across tasks but visual-tactile models for diverse objects have different parameter spaces. To address this issue, we introduce Structured Bayesian Meta-Learning (SBML) that incorporates heterogeneous physics models, enabling learning from training objects with varying appearances and geometries. SBML performs zero-shot vision-only prediction of deformable model parameters and few-shot adaptation after a handful of touches. Experiments show that in two classes of heterogeneous objects, namely plants and shoes, SBML outperforms existing approaches in force and torque prediction accuracy in zero- and few-shot settings.", "keywords": "Multimodal perception;tactile sensing;few-shot learning", "primary_area": "", "supplementary_material": "/attachment/54fb42774df5ae80aef632d7290e7d5355a7d04f.zip", "author": "Shaoxiong Yao;Yifan Zhu;Kris Hauser", "authorids": "~Shaoxiong_Yao1;~Yifan_Zhu8;~Kris_Hauser2", "gender": "M;M;M", "homepage": "https://shaoxiongyao.github.io/;http://kkhauser.web.illinois.edu;https://yifanzhu95.github.io/", "dblp": ";;", "google_scholar": "kT71zbIAAAAJ;-sGaL8sAAAAJ;eyWZo6EAAAAJ", "orcid": ";;", "linkedin": "shaoxiong-yao-8980a916a/;;", "or_profile": "~Shaoxiong_Yao1;~Kris_Hauser2;~Yifan_Zhu9", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana-Champaign;Yale University", "aff_domain": "illinois.edu;illinois.edu;yale.edu", "position": "PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\nyao2024structured,\ntitle={Structured Bayesian Meta-Learning for Data-Efficient Visual-Tactile Model Estimation},\nauthor={Shaoxiong Yao and Yifan Zhu and Kris Hauser},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=TzqKmIhcwq}\n}", "github": "", "project": "", "reviewers": "oWYD;57dj;HDBy", "site": "https://openreview.net/forum?id=TzqKmIhcwq", "pdf_size": 0, "rating": "2;3;4", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7844990449822547675&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2", "aff_unique_norm": "University of Illinois Urbana-Champaign;University of Illinois;Yale University", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://illinois.edu;https://www.yale.edu", "aff_unique_abbr": "UIUC;UIUC;Yale", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "U5RPcnFhkq", "title": "FetchBench: A Simulation Benchmark for Robot Fetching", "track": "main", "status": "Poster", "tldr": "", "abstract": "Fetching, which includes approaching, grasping, and retrieving, is a critical challenge for robot manipulation tasks. 
Existing methods primarily focus on table-top scenarios, which do not adequately capture the complexities of environments where both grasping and planning are essential. To address this gap, we propose a new benchmark FetchBench, featuring diverse procedural scenes that integrate both grasping and motion planning challenges. Additionally, FetchBench includes a data generation pipeline that collects successful fetch trajectories for use in imitation learning methods. We implement multiple baselines from the traditional sense-plan-act pipeline to end-to-end behavior models. Our empirical analysis reveals that these methods achieve a maximum success rate of only 20%, indicating substantial room for improvement. Additionally, we identify key bottlenecks within the sense-plan-act pipeline and make recommendations based on the systematic analysis.", "keywords": "Grasping; Benchmark; Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/852513ee9b5026a9d27df51c5e0fd280e7333512.zip", "author": "Beining Han;Meenal Parakh;Derek Geng;Jack A Defay;Gan Luyang;Jia Deng", "authorids": "~Beining_Han1;~Meenal_Parakh1;~Derek_Geng1;~Jack_A_Defay1;~Gan_Luyang1;~Jia_Deng1", "gender": "M;F;M;M;F;M", "homepage": ";;;https://jackdefay.com/;;", "dblp": "266/7819;357/5567;;;;07/6526-1.html", "google_scholar": "LVjU7xIAAAAJ;;;;;U3Eub-EAAAAJ", "orcid": ";;;0000-0003-3770-6608;;", "linkedin": "%E8%B4%9D%E5%AE%81-%E9%9F%A9-b79204207/details/experience/;meenal-parakh-05a359254/;derek-geng-932763253/;jack-defay-008826188;gan-luyang-0956b8286/;", "or_profile": "~Beining_Han1;~Meenal_Parakh1;~Derek_Geng1;~Jack_A_Defay1;~Gan_Luyang1;~Jia_Deng1", "aff": "Department of Computer Science, Princeton University;Princeton University;Princeton University;Princeton University;Princeton University;Princeton University", "aff_domain": "cs.princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu;princeton.edu", "position": "PhD student;PhD student;Undergrad student;MS student;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nhan2024fetchbench,\ntitle={FetchBench: A Simulation Benchmark for Robot Fetching},\nauthor={Beining Han and Meenal Parakh and Derek Geng and Jack A Defay and Gan Luyang and Jia Deng},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=U5RPcnFhkq}\n}", "github": "https://github.com/princeton-vl/FetchBench-CORL2024", "project": "", "reviewers": "vPSQ;qTGA;XzpA", "site": "https://openreview.net/forum?id=U5RPcnFhkq", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;3;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2093681566922994702&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Princeton University", "aff_unique_dep": "Department of Computer Science", "aff_unique_url": "https://www.princeton.edu", "aff_unique_abbr": "Princeton", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "UHxPZgK33I", "title": "RoboEXP: Action-Conditioned Scene Graph via Interactive Exploration for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce the novel task of interactive scene exploration, wherein robots autonomously 
explore environments and produce an action-conditioned scene graph (ACSG) that captures the structure of the underlying environment. The ACSG accounts for both low-level information (geometry and semantics) and high-level information (action-conditioned relationships between different entities) in the scene. To this end, we present the Robotic Exploration (RoboEXP) system, which incorporates the Large Multimodal Model (LMM) and an explicit memory design to enhance our system's capabilities. The robot reasons about what and how to explore an object, accumulating new information through the interaction process and incrementally constructing the ACSG.\nLeveraging the constructed ACSG, we illustrate the effectiveness and efficiency of our RoboEXP system in facilitating a wide range of real-world manipulation tasks involving rigid, articulated objects, nested objects, and deformable objects. Project Page: https://jianghanxiao.github.io/roboexp-web/", "keywords": "Action-Conditioned Scene Graph;Foundation Models for Robotics;Scene Exploration;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/bff80795c3b8d58cb88027fd345ec7fb0087ec09.zip", "author": "Hanxiao Jiang;Binghao Huang;Ruihai Wu;Zhuoran Li;Shubham Garg;Hooshang Nayyeri;Shenlong Wang;Yunzhu Li", "authorids": "~Hanxiao_Jiang1;~Binghao_Huang1;~Ruihai_Wu1;~Zhuoran_Li3;~Shubham_Garg1;~Hooshang_Nayyeri1;~Shenlong_Wang1;~Yunzhu_Li1", "gender": "M;;M;;M;M;M;M", "homepage": "https://jianghanxiao.github.io/;https://binghao-huang.github.io/;https://warshallrho.github.io/;https://github.com/DavidLzr;;;https://shenlong.web.illinois.edu/;https://yunzhuli.github.io/", "dblp": "196/3481-1;;248/8028.html;;;;117/4842;182/1831", "google_scholar": "-XWZKZAAAAAJ;nqoOetAAAAAJ;https://scholar.google.com/citations?hl=en;;;;QFpswmcAAAAJ;WlA92lcAAAAJ", "orcid": "0000-0001-6245-361X;;;;;;;", "linkedin": ";;;;shubham8garg/;hooshang-nayyeri/;shenlong-wang-3496023b;", "or_profile": "~Hanxiao_Jiang1;~Binghao_Huang1;~Ruihai_Wu1;~Zhuoran_Li3;~Shubham_Garg1;~Hooshang_Nayyeri1;~Shenlong_Wang1;~Yunzhu_Li1", "aff": "University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign;Peking University;National University of Singapore;Amazon;Amazon;University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu;pku.edu.cn;u.nus.edu;amazon.com;amazon.com;illinois.edu;illinois.edu", "position": "PhD student;PhD student;PhD student;Undergrad student;Senior Applied Scientist;Researcher;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\njiang2024roboexp,\ntitle={Robo{EXP}: Action-Conditioned Scene Graph via Interactive Exploration for Robotic Manipulation},\nauthor={Hanxiao Jiang and Binghao Huang and Ruihai Wu and Zhuoran Li and Shubham Garg and Hooshang Nayyeri and Shenlong Wang and Yunzhu Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UHxPZgK33I}\n}", "github": "https://github.com/Jianghanxiao/RoboEXP", "project": "", "reviewers": "5sxQ;hwZM;Pzsb", "site": "https://openreview.net/forum?id=UHxPZgK33I", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;3;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8304124621072885519&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;2;3;3;0;0", "aff_unique_norm": "University of 
Illinois Urbana-Champaign;Peking University;National University of Singapore;Amazon", "aff_unique_dep": ";;;Amazon.com, Inc.", "aff_unique_url": "https://illinois.edu;http://www.pku.edu.cn;https://www.nus.edu.sg;https://www.amazon.com", "aff_unique_abbr": "UIUC;Peking U;NUS;Amazon", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Urbana-Champaign;", "aff_country_unique_index": "0;0;1;2;0;0;0;0", "aff_country_unique": "United States;China;Singapore" }, { "id": "URj5TQTAXM", "title": "OKAMI: Teaching Humanoid Robots Manipulation Skills through Single Video Imitation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We study the problem of teaching humanoid robots manipulation skills by imitating from single video demonstrations. We introduce OKAMI, a method that generates a manipulation plan from a single RGB-D video and derives a policy for execution. At the heart of our approach is object-aware retargeting, which enables the humanoid robot to mimic the human motions in an RGB-D video while adjusting to different object locations during deployment. OKAMI uses open-world vision models to identify task-relevant objects and retarget the body motions and hand poses separately. Our experiments show that OKAMI achieves strong generalizations across varying visual and spatial conditions, outperforming the state-of-the-art baseline on open-world imitation from observation. Furthermore, OKAMI rollout trajectories are leveraged to train closed-loop visuomotor policies, which achieve an average success rate of $79.2\\%$ without the need for labor-intensive teleoperation. More videos can be found on our \nwebsite https://ut-austin-rpl.github.io/OKAMI/.", "keywords": "Humanoid Manipulation;Imitation From Videos;Motion Retargeting", "primary_area": "", "supplementary_material": "/attachment/0add412bd8194e5e7de34a1a5ecbd253c09c65b8.zip", "author": "Jinhan Li;Yifeng Zhu;Yuqi Xie;Zhenyu Jiang;Mingyo Seo;Georgios Pavlakos;Yuke Zhu", "authorids": "~Jinhan_Li2;~Yifeng_Zhu2;~Yuqi_Xie1;~Zhenyu_Jiang1;~Mingyo_Seo1;~Georgios_Pavlakos1;~Yuke_Zhu1", "gender": "F;M;M;M;;M;M", "homepage": "https://lijinhan21.github.io/;https://cs.utexas.edu/~yifengz;https://xieleo5.github.io/;https://zhenyujiang.me/;https://mingyoseo.com;https://geopavlakos.github.io/;https://cs.utexas.edu/~yukez/", "dblp": ";;;55/10479-2;;145/3361;133/1772", "google_scholar": ";;;2KLTzZIAAAAJ;;iH2BZ8UAAAAJ;mWGyYMsAAAAJ", "orcid": ";;;0000-0002-9711-7461;;;", "linkedin": ";;;;;;", "or_profile": "~Jinhan_Li2;~Yifeng_Zhu2;~Yuqi_Xie1;~Zhenyu_Jiang1;~Mingyo_Seo1;~Georgios_Pavlakos1;~Yuke_Zhu1", "aff": "Tsinghua University;The University of Texas at Austin;University of Texas at Austin;University of Texas, Austin;University of Texas at Austin;University of Texas at Austin;Computer Science Department, University of Texas, Austin", "aff_domain": "mails.tsinghua.edu.cn;utexas.edu;utexas.edu;utexas.edu;utexas.edu;cs.utexas.edu;cs.utexas.edu", "position": "Undergrad student;PhD student;MS student;PhD student;PhD student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nli2024okami,\ntitle={{OKAMI}: Teaching Humanoid Robots Manipulation Skills through Single Video Imitation},\nauthor={Jinhan Li and Yifeng Zhu and Yuqi Xie and Zhenyu Jiang and Mingyo Seo and Georgios Pavlakos and Yuke Zhu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=URj5TQTAXM}\n}", "github": "", "project": "", "reviewers": "MV4q;MHD2;Qi9R;XCSN", "site": 
"https://openreview.net/forum?id=URj5TQTAXM", "pdf_size": 0, "rating": "3;3;3;4", "confidence": "5;3;3;4", "rating_avg": 3.25, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 7, "corr_rating_confidence": 0.17407765595569782, "gs_citation": 33, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12378918091517497211&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "Tsinghua University;University of Texas at Austin", "aff_unique_dep": ";", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.utexas.edu", "aff_unique_abbr": "THU;UT Austin", "aff_campus_unique_index": "1;1;1;1;1;1", "aff_campus_unique": ";Austin", "aff_country_unique_index": "0;1;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "UUZ4Yw3lt0", "title": "Harmon: Whole-Body Motion Generation of Humanoid Robots from Language Descriptions", "track": "main", "status": "Poster", "tldr": "", "abstract": "Humanoid robots, with their human-like embodiment, have the potential to integrate seamlessly into human environments. Critical to their coexistence and cooperation with humans is the ability to understand natural language communications and exhibit human-like behaviors. This work focuses on generating diverse whole-body motions for humanoid robots from language descriptions. We leverage human motion priors from extensive human motion datasets to initialize humanoid motions and employ the commonsense reasoning capabilities of Vision Language Models (VLMs) to edit and refine these motions. Our approach demonstrates the capability to produce natural, expressive, and text-aligned humanoid motions, validated through both simulated and real-world experiments. More videos can be found on our website https://ut-austin-rpl.github.io/Harmon/.", "keywords": "Humanoid Robot;Whole-Body Motion Generation", "primary_area": "", "supplementary_material": "/attachment/f2e7bcd50854a774d2eec4b21a4d449681f544f1.zip", "author": "Zhenyu Jiang;Yuqi Xie;Jinhan Li;Ye Yuan;Yifeng Zhu;Yuke Zhu", "authorids": "~Zhenyu_Jiang1;~Yuqi_Xie1;~Jinhan_Li2;~Ye_Yuan5;~Yifeng_Zhu2;~Yuke_Zhu1", "gender": "M;M;F;M;M;M", "homepage": "https://zhenyujiang.me/;https://xieleo5.github.io/;https://lijinhan21.github.io/;https://www.ye-yuan.com;https://cs.utexas.edu/~yifengz;https://cs.utexas.edu/~yukez/", "dblp": "55/10479-2;;;33/6315-7;;133/1772", "google_scholar": "2KLTzZIAAAAJ;;;EEp82sIAAAAJ;;mWGyYMsAAAAJ", "orcid": "0000-0002-9711-7461;;;;;", "linkedin": ";;;;;", "or_profile": "~Zhenyu_Jiang1;~Yuqi_Xie1;~Jinhan_Li2;~Ye_Yuan5;~Yifeng_Zhu2;~Yuke_Zhu1", "aff": "University of Texas, Austin;University of Texas at Austin;Tsinghua University;NVIDIA Research;The University of Texas at Austin;Computer Science Department, University of Texas, Austin", "aff_domain": "utexas.edu;utexas.edu;mails.tsinghua.edu.cn;nvidia.com;utexas.edu;cs.utexas.edu", "position": "PhD student;MS student;Undergrad student;Researcher;PhD student;Assistant Professor", "bibtex": "@inproceedings{\njiang2024harmon,\ntitle={Harmon: Whole-Body Motion Generation of Humanoid Robots from Language Descriptions},\nauthor={Zhenyu Jiang and Yuqi Xie and Jinhan Li and Ye Yuan and Yifeng Zhu and Yuke Zhu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=UUZ4Yw3lt0}\n}", "github": "", "project": "", "reviewers": "Ynst;HfRF;4CM1", "site": "https://openreview.net/forum?id=UUZ4Yw3lt0", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, 
"confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16557792497094658846&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0;0", "aff_unique_norm": "University of Texas at Austin;Tsinghua University;NVIDIA", "aff_unique_dep": ";;NVIDIA Research", "aff_unique_url": "https://www.utexas.edu;https://www.tsinghua.edu.cn;https://www.nvidia.com/research", "aff_unique_abbr": "UT Austin;THU;NVIDIA", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Austin;", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "United States;China" }, { "id": "Uaaj4MaVIQ", "title": "D$^3$Fields: Dynamic 3D Descriptor Fields for Zero-Shot Generalizable Rearrangement", "track": "main", "status": "Poster", "tldr": "", "abstract": "Scene representation is a crucial design choice in robotic manipulation systems. An ideal representation is expected to be 3D, dynamic, and semantic to meet the demands of diverse manipulation tasks. However, previous works often lack all three properties simultaneously. In this work, we introduce D$^3$Fields---**dynamic 3D descriptor fields**. These fields are **implicit 3D representations** that take in 3D points and output semantic features and instance masks. They can also capture the dynamics of the underlying 3D environments. Specifically, we project arbitrary 3D points in the workspace onto multi-view 2D visual observations and interpolate features derived from visual foundational models. The resulting fused descriptor fields allow for flexible goal specifications using 2D images with varied contexts, styles, and instances. To evaluate the effectiveness of these descriptor fields, we apply our representation to rearrangement tasks in a zero-shot manner. Through extensive evaluation in real worlds and simulations, we demonstrate that D$^3$Fields are effective for **zero-shot generalizable** rearrangement tasks. We also compare D$^3$Fields with state-of-the-art implicit 3D representations and show significant improvements in effectiveness and efficiency. 
Project page: https://robopil.github.io/d3fields/", "keywords": "Implicit 3D Representation;Visual Foundational Model;Zero-Shot Generalization;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/e38855025ad6430f3c43d09014b4baa9ee805ed4.zip", "author": "Yixuan Wang;Mingtong Zhang;Zhuoran Li;Tarik Kelestemur;Katherine Rose Driggs-Campbell;Jiajun Wu;Li Fei-Fei;Yunzhu Li", "authorids": "~Yixuan_Wang2;~Mingtong_Zhang1;~Zhuoran_Li3;~Tarik_Kelestemur1;~Katherine_Rose_Driggs-Campbell1;~Jiajun_Wu1;~Li_Fei-Fei1;~Yunzhu_Li1", "gender": "M;M;;;;M;F;M", "homepage": "https://wangyixuan12.github.io/;https://robo-alex.github.io/;https://github.com/DavidLzr;https://kelestemur.com/;;https://jiajunwu.com;https://profiles.stanford.edu/fei-fei-li;https://yunzhuli.github.io/", "dblp": "44/4317-3;;;;;117/4768;79/2528;182/1831", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;2efgcS0AAAAJ;rDfyQnIAAAAJ;WlA92lcAAAAJ", "orcid": "0009-0006-6641-4718;;;;;0000-0002-4176-343X;;", "linkedin": "yixuan-wang-54298115a;;;;;jiajunwu/;fei-fei-li-4541247/;", "or_profile": "~Yixuan_Wang2;~Mingtong_Zhang1;~Zhuoran_Li3;~Tarik_Kelestemur1;~Katherine_Rose_Driggs-Campbell1;~Jiajun_Wu1;~Li_Fei-Fei1;~Yunzhu_Li1", "aff": "University of Illinois, Urbana Champaign;University of Illinois, Urbana Champaign;National University of Singapore;Boston Dynamics AI Institute;;Stanford University;Stanford University;University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu;u.nus.edu;theaiinstitute.com;;stanford.edu;stanford.edu;illinois.edu", "position": "MS student;MS student;Undergrad student;Researcher;;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024dfields,\ntitle={D\\${\\textasciicircum}3\\$Fields: Dynamic 3D Descriptor Fields for Zero-Shot Generalizable Rearrangement},\nauthor={Yixuan Wang and Mingtong Zhang and Zhuoran Li and Tarik Kelestemur and Katherine Rose Driggs-Campbell and Jiajun Wu and Li Fei-Fei and Yunzhu Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Uaaj4MaVIQ}\n}", "github": "https://github.com/WangYixuan12/d3fields", "project": "", "reviewers": "vc8A;FgtV;EQkG", "site": "https://openreview.net/forum?id=Uaaj4MaVIQ", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13120350531640855084&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2;3;3;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;National University of Singapore;Boston Dynamics AI Institute;Stanford University", "aff_unique_dep": ";;AI Institute;", "aff_unique_url": "https://illinois.edu;https://www.nus.edu.sg;https://www.bostondynamics.com/;https://www.stanford.edu", "aff_unique_abbr": "UIUC;NUS;BD AI;Stanford", "aff_campus_unique_index": "0;0;2;2;0", "aff_campus_unique": "Urbana-Champaign;;Stanford", "aff_country_unique_index": "0;0;1;0;0;0;0", "aff_country_unique": "United States;Singapore" }, { "id": "V5x0m6XDSV", "title": "Differentiable Discrete Elastic Rods for Real-Time Modeling of Deformable Linear Objects", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper addresses the task of modeling Deformable Linear Objects (DLOs), such as ropes and cables, during dynamic motion over 
long time horizons. This task presents significant challenges due to the complex dynamics of DLOs. To address these challenges, this paper proposes differentiable Discrete Elastic Rods For deformable linear Objects with Real-time Modeling (DEFORM), a novel framework that combines a differentiable physics-based model with a learning framework to model DLOs accurately and in real-time. The performance of DEFORM is evaluated in an experimental setup involving two industrial robots and a variety of sensors. A comprehensive series of experiments demonstrate the efficacy of DEFORM in terms of accuracy, computational speed, and generalizability when compared to state-of-the-art alternatives. To further demonstrate the utility of DEFORM, this paper integrates it into a perception pipeline and illustrates its superior performance when compared to the state-of-the-art methods while tracking a DLO even in the presence of occlusions. Finally, this paper illustrates the superior performance of DEFORM when compared to state-of-the-art methods when it is applied to perform autonomous planning and control of DLOs.", "keywords": "Deformable Linear Objects Modeling;Physics-Informed Learning;Differentiable Simulation", "primary_area": "", "supplementary_material": "/attachment/7d48de4444b5d8e28b85cce26d1bec9477006a92.zip", "author": "Yizhou Chen;Yiting Zhang;Zachary Brei;Tiancheng Zhang;Yuzhen Chen;Julie Wu;Ram Vasudevan", "authorids": "~Yizhou_Chen4;~Yiting_Zhang3;breizach@umich.edu;~Tiancheng_Zhang3;~Yuzhen_Chen1;jwuxx@umich.edu;~Ram_Vasudevan2", "gender": "M;M;;M;M;;", "homepage": ";https://sites.google.com/umich.edu/yitingzhang/;;https://sites.google.com/umich.edu/tiancheng-zhang-homepage/home;;;", "dblp": ";;;;;;", "google_scholar": ";sbPYzZ8AAAAJ;;;;;", "orcid": ";0009-0004-6623-3162;;;;;", "linkedin": "yizhou-chen-325819184/;yiting-zhang-6929791a7/;;tiancheng-zhang-70a8b2224/;yuzhench;;", "or_profile": "~Yizhou_Chen4;~Yiting_Zhang3;breizach@umich.edu;~Tiancheng_Zhang3;~Yuzhen_Chen1;jwuxx@umich.edu;~Ram_Vasudevan2", "aff": "University of Michigan - Ann Arbor;University of Michigan - Ann Arbor;;;University of Michigan - Ann Arbor;;", "aff_domain": "umich.edu;umich.edu;;;umich.edu;;", "position": "PhD student;PhD student;;;Undergrad student;;", "bibtex": "@inproceedings{\nchen2024differentiable,\ntitle={Differentiable Discrete Elastic Rods for Real-Time Modeling of Deformable Linear Objects},\nauthor={Yizhou Chen and Yiting Zhang and Zachary Brei and Tiancheng Zhang and Yuzhen Chen and Julie Wu and Ram Vasudevan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=V5x0m6XDSV}\n}", "github": "https://github.com/roahmlab/DEFORM", "project": "", "reviewers": "jfrQ;rJLv;ktjq", "site": "https://openreview.net/forum?id=V5x0m6XDSV", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6832948218315928158&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Michigan", "aff_unique_dep": "", "aff_unique_url": "https://www.umich.edu", "aff_unique_abbr": "UM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Ann Arbor", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "VFs1vbQnYN", "title": "Sim-to-Real Transfer via 3D 
Feature Fields for Vision-and-Language Navigation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Vision-and-language navigation (VLN) enables the agent to navigate to a remote location in 3D environments following the natural language instruction. In this field, the agent is usually trained and evaluated in the navigation simulators, lacking effective approaches for sim-to-real transfer. The VLN agents with only a monocular camera exhibit extremely limited performance, while the mainstream VLN models trained with panoramic observation, perform better but are difficult to deploy on most monocular robots. For this case, we propose a sim-to-real transfer approach to endow the monocular robots with panoramic traversability perception and panoramic semantic understanding, thus smoothly transferring the high-performance panoramic VLN models to the common monocular robots. In this work, the semantic traversable map is proposed to predict agent-centric navigable waypoints, and the novel view representations of these navigable waypoints are predicted through the 3D feature fields. These methods broaden the limited field of view of the monocular robots and significantly improve navigation performance in the real world. Our VLN system outperforms previous SOTA monocular VLN methods in R2R-CE and RxR-CE benchmarks within the simulation environments and is also validated in real-world environments, providing a practical and high-performance solution for real-world VLN.", "keywords": "Vision-and-Language Navigation;3D Feature Fields;Semantic Traversable Map", "primary_area": "", "supplementary_material": "/attachment/3be17b1353b8a579bb2949f4cd086341b177df87.zip", "author": "Zihan Wang;Xiangyang Li;Jiahao Yang;Yeqi Liu;Shuqiang Jiang", "authorids": "~Zihan_Wang11;~Xiangyang_Li2;~Jiahao_Yang5;~Yeqi_Liu2;~Shuqiang_Jiang1", "gender": "M;M;;;M", "homepage": ";https://xiangyangli-cn.github.io/;;;https://people.ucas.edu.cn/~sqjiang?language=en", "dblp": ";80/4579-2;;;90/3651", "google_scholar": "https://scholar.google.cz/citations?user=7rf6Bw4AAAAJ;n6WBCgUAAAAJ;;;4Rvn-ykAAAAJ", "orcid": ";0000-0002-3944-4704;;;0000-0002-1596-4326", "linkedin": ";;;;", "or_profile": "~Zihan_Wang11;~Xiangyang_Li2;~Jiahao_Yang5;~Yeqi_Liu2;~Shuqiang_Jiang1", "aff": "Chinese Academy of Sciences;Institute of Computing Technology, Chinese Academy of Sciences;;;Institute of Computing Technology, Chinese Academy of Sciences", "aff_domain": "ict.ac.cn;ict.ac.cn;;;ict.ac.cn", "position": "MS student;Assistant Professor;;;Professor", "bibtex": "@inproceedings{\nwang2024simtoreal,\ntitle={Sim-to-Real Transfer via 3D Feature Fields for Vision-and-Language Navigation},\nauthor={Zihan Wang and Xiangyang Li and Jiahao Yang and Yeqi Liu and Shuqiang Jiang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VFs1vbQnYN}\n}", "github": "https://github.com/MrZihan/Sim2Real-VLN-3DFF", "project": "", "reviewers": "yffA;QJ8S;U8bM", "site": "https://openreview.net/forum?id=VFs1vbQnYN", "pdf_size": 0, "rating": "3;3;3", "confidence": "1;4;4", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10803752933253890044&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Chinese Academy of Sciences", "aff_unique_dep": "", "aff_unique_url": "https://www.cas.cn", "aff_unique_abbr": "CAS", 
"aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "VMqg1CeUQP", "title": "DexCatch: Learning to Catch Arbitrary Objects with Dexterous Hands", "track": "main", "status": "Poster", "tldr": "", "abstract": "Achieving human-like dexterous manipulation remains a crucial area of research in robotics. Current research focuses on improving the success rate of pick-and-place tasks. Compared with pick-and-place, throwing-catching behavior has the potential to increase the speed of transporting objects to their destination. However, dynamic dexterous manipulation poses a major challenge for stable control due to a large number of dynamic contacts. In this paper, we propose a Learning-based framework for Throwing-Catching tasks using dexterous hands (LTC). Our method, LTC, achieves a 73% success rate across 45 scenarios (diverse hand poses and objects), and the learned policies demonstrate strong zero-shot transfer performance on unseen objects. Additionally, in tasks where the object in hand faces sideways, an extremely unstable scenario due to the lack of support from the palm, all baselines fail, while our method still achieves a success rate of over 60%.", "keywords": "Reinforcement Learning;Dexterous Manipulation;System Stability", "primary_area": "", "supplementary_material": "/attachment/68b94ce45582f718c05dd0ffe57854df62abdddf.zip", "author": "Fengbo Lan;Shengjie Wang;Yunzhe Zhang;Haotian Xu;Oluwatosin OluwaPelumi Oseni;Ziye Zhang;Yang Gao;Tao Zhang", "authorids": "~Fengbo_Lan2;~Shengjie_Wang2;~Yunzhe_Zhang3;~Haotian_Xu6;~Oluwatosin_OluwaPelumi_Oseni1;~Ziye_Zhang2;~Yang_Gao1;~Tao_Zhang9", "gender": ";M;;M;M;F;M;", "homepage": ";https://shengjiewang-jason.github.io/;;;https://tohsin.github.io/;;http://yang-gao.weebly.com;", "dblp": ";;;;;;89/4402-29;", "google_scholar": ";;;krfIJPUAAAAJ;;;https://scholar.google.com/citations?hl=en;", "orcid": ";;;;;0009-0004-5625-3178;;", "linkedin": ";;;;;;yang-gao-45245348/;", "or_profile": "~Fengbo_Lan2;~Shengjie_Wang2;~Yunzhe_Zhang3;~Haotian_Xu6;~Oluwatosin_OluwaPelumi_Oseni1;~Ziye_Zhang2;~Yang_Gao1;~Tao_Zhang9", "aff": ";Tsinghua University;;Tsinghua University;;Tsinghua University;Tsinghua University;", "aff_domain": ";mails.tsinghua.edu.cn;;mails.tsinghua.edu.cn;;mails.tsinghua.edu.cn;tsinghua.edu.cn;", "position": ";PhD student;;MS student;;Undergrad student;Assistant Professor;", "bibtex": "@inproceedings{\nlan2024dexcatch,\ntitle={DexCatch: Learning to Catch Arbitrary Objects with Dexterous Hands},\nauthor={Fengbo Lan and Shengjie Wang and Yunzhe Zhang and Haotian Xu and Oluwatosin OluwaPelumi Oseni and Ziye Zhang and Yang Gao and Tao Zhang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VMqg1CeUQP}\n}", "github": "", "project": "", "reviewers": "RD6F;2Jip;JPgA", "site": "https://openreview.net/forum?id=VMqg1CeUQP", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9438659056445984024&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", 
"aff_country_unique": "China" }, { "id": "VUhlMfEekm", "title": "Implicit Grasp Diffusion: Bridging the Gap between Dense Prediction and Sampling-based Grasping", "track": "main", "status": "Poster", "tldr": "", "abstract": "There are two dominant approaches in modern robot grasp planning: dense prediction and sampling-based methods. Dense prediction calculates viable grasps across the robot\u2019s view but is limited to predicting one grasp per voxel. Sampling-based methods, on the other hand, encode multi-modal grasp distributions, allowing for different grasp approaches at a point. However, these methods rely on a global latent representation, which struggles to represent the entire field of view, resulting in coarse grasps. To address this, we introduce \\emph{Implicit Grasp Diffusion} (IGD), which combines the strengths of both methods by using implicit neural representations to extract detailed local features and sampling grasps from diffusion models conditioned on these features. Evaluations on clutter removal tasks in both simulated and real-world environments show that IGD delivers high accuracy, noise resilience, and multi-modal grasp pose capabilities.", "keywords": "Grasping;Implicit Neural Representations;Diffusion Models", "primary_area": "", "supplementary_material": "/attachment/351916e6bf36ac3e7d6bb29a0092d1c2440ffc18.zip", "author": "Pinhao Song;Pengteng Li;Renaud Detry", "authorids": "~Pinhao_Song1;2110276192@email.szu.edu.cn;renaud.detry@kuleuven.be", "gender": "M;;", "homepage": ";;", "dblp": ";;", "google_scholar": "pgD4ZGgAAAAJ;;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Pinhao_Song1;2110276192@email.szu.edu.cn;renaud.detry@kuleuven.be", "aff": "KU Leuven;;", "aff_domain": "kuleuven.be;;", "position": "PhD student;;", "bibtex": "@inproceedings{\nsong2024implicit,\ntitle={Implicit Grasp Diffusion: Bridging the Gap between Dense Prediction and Sampling-based Grasping},\nauthor={Pinhao Song and Pengteng Li and Renaud Detry},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VUhlMfEekm}\n}", "github": "https://gitlab.kuleuven.be/detry-lab/public/implicit-grasp-diffusion.git", "project": "", "reviewers": "TcPf;bjQA;cuMq", "site": "https://openreview.net/forum?id=VUhlMfEekm", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;5", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13181044419282460318&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Katholieke Universiteit Leuven", "aff_unique_dep": "", "aff_unique_url": "https://www.kuleuven.be", "aff_unique_abbr": "KU Leuven", "aff_country_unique_index": "0", "aff_country_unique": "Belgium" }, { "id": "VdyIhsh1jU", "title": "Legolas: Deep Leg-Inertial Odometry", "track": "main", "status": "Poster", "tldr": "", "abstract": "Estimating odometry, where an accumulating position and rotation is tracked, has critical applications in many areas of robotics as a form of state estimation such as in SLAM, navigation, and controls. During deployment of a legged robot, a vision system's tracking can easily get lost. Instead, using only the onboard leg and inertial sensor for odometry is a promising alternative. Previous methods in estimating leg-inertial odometry require analytical modeling or collecting high-quality real-world trajectories to train a model. 
Analytical modeling is specific to each robot, requires manual fine-tuning, and doesn't always capture real-world phenomena such as slippage. Previous work learning legged odometry still relies on collecting real-world data, which has been shown not to perform well out of distribution. In this work, we show that it is possible to estimate the odometry of a legged robot without any analytical modeling or real-world data collection. In this paper, we present Legolas, the first method that accurately estimates odometry in a purely data-driven fashion for quadruped robots. We deploy our method on two real-world quadruped robots in both indoor and outdoor environments. In the indoor scenes, our proposed method accomplishes a relative pose error that is 73% less than an analytical filtering-based approach and 87.5% less than a real-world behavioral cloning approach.\nMore results are available at: learned-odom.github.io", "keywords": "State and Odometry Estimation;Quadruped robots;Sim-to-Real", "primary_area": "", "supplementary_material": "/attachment/8af37d399ad7a121b384ab1a0065d19a28729521.zip", "author": "Justin Wasserman;Ananye Agarwal;Rishabh Jangir;Girish Chowdhary;Deepak Pathak;Abhinav Gupta", "authorids": "~Justin_Wasserman1;~Ananye_Agarwal1;~Rishabh_Jangir1;~Girish_Chowdhary1;~Deepak_Pathak1;~Abhinav_Gupta1", "gender": ";M;M;M;M;M", "homepage": "https://jbwasse2.github.io/;https://anag.me/;https://jangirrishabh.github.io/;http://www.daslab.illinois.edu;https://www.cs.cmu.edu/~dpathak/;http://www.cs.cmu.edu/~abhinavg", "dblp": ";294/4812;;09/5775;155/9860;36/7024-1", "google_scholar": "6OP2QCkAAAAJ;https://scholar.google.com/citations?hl=en;UFokX9EAAAAJ;pf2zAXkAAAAJ;https://scholar.google.cl/citations?user=AEsPCAUAAAAJ;https://scholar.google.com.tw/citations?user=bqL73OkAAAAJ", "orcid": ";;;;;", "linkedin": ";;rishabh-jangir-74b1929b/;girishchowdhary/;pathak22/;", "or_profile": "~Justin_Wasserman1;~Ananye_Agarwal1;~Rishabh_Jangir1;~Girish_Chowdhary1;~Deepak_Pathak1;~Abhinav_Gupta1", "aff": "University of Illinois, Urbana Champaign;Carnegie Mellon University;University of California, San Diego;University of Illinois, Urbana Champaign;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "illinois.edu;cmu.edu;ucsd.edu;illinois.edu;cmu.edu;cmu.edu", "position": "PhD student;PhD student;MS student;Associate Professor;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\nwasserman2024legolas,\ntitle={Legolas: Deep Leg-Inertial Odometry},\nauthor={Justin Wasserman and Ananye Agarwal and Rishabh Jangir and Girish Chowdhary and Deepak Pathak and Abhinav Gupta},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VdyIhsh1jU}\n}", "github": "", "project": "", "reviewers": "KiPa;8eXK;hmFN", "site": "https://openreview.net/forum?id=VdyIhsh1jU", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;5", "rating_avg": 2.6666666666666665, "confidence_avg": 5.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3998700927440257370&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;0;1;1", "aff_unique_norm": "University of Illinois Urbana-Champaign;Carnegie Mellon University;University of California, San Diego", "aff_unique_dep": ";;", "aff_unique_url": "https://illinois.edu;https://www.cmu.edu;https://www.ucsd.edu", "aff_unique_abbr": "UIUC;CMU;UCSD", "aff_campus_unique_index": "0;2;0", 
"aff_campus_unique": "Urbana-Champaign;;San Diego", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "VoC3wF6fbh", "title": "Learning to Open and Traverse Doors with a Legged Manipulator", "track": "main", "status": "Poster", "tldr": "", "abstract": "Using doors is a longstanding challenge in robotics and is of significant practical interest in giving robots greater access to human-centric spaces. The task is challenging due to the need for online adaptation to varying door properties and precise control in manipulating the door panel and navigating through the confined doorway. To address this, we propose a learning-based controller for a legged manipulator to open and traverse through doors. The controller is trained using a teacher-student approach in simulation to learn robust task behaviors as well as estimate crucial door properties during the interaction. Unlike previous works, our approach is a single control policy that can handle both push and pull doors through learned behaviour which infers the opening direction during deployment without prior knowledge. The policy was deployed on the ANYmal legged robot with an arm and achieved a success rate of 95.0% in repeated trials conducted in an experimental setting. Additional experiments validate the policy's effectiveness and robustness to various doors and disturbances. A video overview of the method and experiments is provided in the supplementary material.", "keywords": "Mobile Manipulation;Legged Manipulator;Reinforcement Learning;Door Opening", "primary_area": "", "supplementary_material": "/attachment/65331ae955a63b5aefbe0e3d62c09a5eaadcd68a.zip", "author": "Mike Zhang;Yuntao Ma;Takahiro Miki;Marco Hutter", "authorids": "~Mike_Zhang2;~Yuntao_Ma2;~Takahiro_Miki1;~Marco_Hutter1", "gender": ";M;M;M", "homepage": "https://clams-casino.github.io/;https://articuno144.github.io/;;http://www.rsl.ethz.ch", "dblp": ";;;04/2753", "google_scholar": ";;nOl83tYAAAAJ;https://scholar.google.ch/citations?user=DO3quJYAAAAJ", "orcid": ";;;0000-0002-4285-4990", "linkedin": ";;;", "or_profile": "~Mike_Zhang2;~Yuntao_Ma2;~Takahiro_Miki1;~Marco_Hutter1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;;ethz.ch", "position": "PhD student;PhD student;;Associate Professor", "bibtex": "@inproceedings{\nzhang2024learning,\ntitle={Learning to Open and Traverse Doors with a Legged Manipulator},\nauthor={Mike Zhang and Yuntao Ma and Takahiro Miki and Marco Hutter},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=VoC3wF6fbh}\n}", "github": "", "project": "", "reviewers": "AGnv;wfP1;26YG", "site": "https://openreview.net/forum?id=VoC3wF6fbh", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;5;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=199569861702637461&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "WLOTZHmmO6", "title": "Let Occ Flow: Self-Supervised 3D Occupancy Flow Prediction", "track": "main", "status": "Poster", "tldr": 
"", "abstract": "Accurate perception of the dynamic environment is a fundamental task for autonomous driving and robot systems. This paper introduces Let Occ Flow, the first self-supervised work for joint 3D occupancy and occupancy flow prediction using only camera inputs, eliminating the need for 3D annotations. Utilizing TPV for unified scene representation and deformable attention layers for feature aggregation, our approach incorporates a novel attention-based temporal fusion module to capture dynamic object dependencies, followed by a 3D refine module for fine-gained volumetric representation. Besides, our method extends differentiable rendering to 3D volumetric flow fields, leveraging zero-shot 2D segmentation and optical flow cues for dynamic decomposition and motion optimization. Extensive experiments on nuScenes and KITTI datasets demonstrate the competitive performance of our approach over prior state-of-the-art methods.", "keywords": "3D occupancy prediction;occupancy flow;Neural Radiance Field", "primary_area": "", "supplementary_material": "/attachment/fe04462d254fa8c923da9a0b389ea8cbc8065e92.zip", "author": "Yili Liu;Linzhan Mou;Xuan Yu;Chenrui Han;Sitong Mao;Rong Xiong;Yue Wang", "authorids": "~Yili_Liu1;~Linzhan_Mou1;~Xuan_Yu1;~Chenrui_Han1;~Sitong_Mao1;~Rong_Xiong1;~Yue_Wang1", "gender": "M;;F;M;;;M", "homepage": ";;;https://github.com/Zero314159;;;https://ywang-zju.github.io/", "dblp": ";;;;204/0152;;", "google_scholar": "pBEZ7V4AAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;https://scholar.google.com.hk/citations?user=RiX5SJUAAAAJ;1hI9bqUAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Yili_Liu1;~Linzhan_Mou1;~Xuan_Yu1;~Chenrui_Han1;~Sitong_Mao1;~Rong_Xiong1;~Yue_Wang1", "aff": "Zhejiang University;;Zhejiang University;Zhejiang University;The Hong Kong Polytechnic University;Zhejiang University;Zhejiang University", "aff_domain": "zju.edu.cn;;zju.edu.cn;zju.edu.cn;polyu.edu.hk;zju.edu.cn;zju.edu.cn", "position": "MS student;;PhD student;MS student;PhD student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024let,\ntitle={Let Occ Flow: Self-Supervised 3D Occupancy Flow Prediction},\nauthor={Yili Liu and Linzhan Mou and Xuan Yu and Chenrui Han and Sitong Mao and Rong Xiong and Yue Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WLOTZHmmO6}\n}", "github": "https://github.com/eliliu2233/occ-flow", "project": "", "reviewers": "PYze;Dnni;hWju", "site": "https://openreview.net/forum?id=WLOTZHmmO6", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14711942367342004856&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Zhejiang University;Hong Kong Polytechnic University", "aff_unique_dep": ";", "aff_unique_url": "https://www.zju.edu.cn;https://www.polyu.edu.hk", "aff_unique_abbr": "ZJU;PolyU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "WjDR48cL3O", "title": "Continuous Control with Coarse-to-fine Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Despite recent advances in improving the sample-efficiency of reinforcement learning (RL) algorithms, designing 
an RL algorithm that can be practically deployed in real-world environments remains a challenge. In this paper, we present Coarse-to-fine Reinforcement Learning (CRL), a framework that trains RL agents to zoom-into a continuous action space in a coarse-to-fine manner, enabling the use of stable, sample-efficient value-based RL algorithms for fine-grained continuous control tasks. Our key idea is to train agents that output actions by iterating the procedure of (i) discretizing the continuous action space into multiple intervals and (ii) selecting the interval with the highest Q-value to further discretize at the next level. We then introduce a concrete, value-based algorithm within the CRL framework called Coarse-to-fine Q-Network (CQN). Our experiments demonstrate that CQN significantly outperforms RL and behavior cloning baselines on 20 sparsely-rewarded RLBench manipulation tasks with a modest number of environment interactions and expert demonstrations. We also show that CQN robustly learns to solve real-world manipulation tasks within a few minutes of online training.", "keywords": "Reinforcement Learning;Sample-Efficient;Action Discretization", "primary_area": "", "supplementary_material": "/attachment/8269450a38625e4e856bd7e24888ae7aa77466b9.zip", "author": "Younggyo Seo;Jafar Uru\u00e7;Stephen James", "authorids": "~Younggyo_Seo1;~Jafar_Uru\u00e71;~Stephen_James1", "gender": "M;M;M", "homepage": "https://younggyo.me/;https://github.com/JafarAbdi/;https://stepjam.github.io/", "dblp": "265/5586;;163/5669", "google_scholar": "tI1-YwIAAAAJ;;OXtG-isAAAAJ", "orcid": ";;", "linkedin": ";jafar-uruc/;", "or_profile": "~Younggyo_Seo1;~Jafar_Uru\u00e71;~Stephen_James1", "aff": "Dyson;London Dyson Robot Learning Lab;Dyson", "aff_domain": "dyson.com;dyson.com;dyson.com", "position": "Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nseo2024continuous,\ntitle={Continuous Control with Coarse-to-fine Reinforcement Learning},\nauthor={Younggyo Seo and Jafar Uru{\\c{c}} and Stephen James},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WjDR48cL3O}\n}", "github": "https://github.com/younggyoseo/CQN", "project": "", "reviewers": "qKY2;D3eS;4fBs", "site": "https://openreview.net/forum?id=WjDR48cL3O", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2441016515703928715&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Dyson", "aff_unique_dep": "", "aff_unique_url": "https://www.dyson.com", "aff_unique_abbr": "", "aff_campus_unique_index": "1", "aff_campus_unique": ";London", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "WmWbswjTsi", "title": "Cloth-Splatting: 3D Cloth State Estimation from RGB Supervision", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce Cloth-Splatting, a method for estimating 3D states of cloth from RGB images through a prediction-update framework. Cloth-Splatting leverages an action-conditioned dynamics model for predicting future states and uses 3D Gaussian Splatting to update the predicted states. 
Our key insight is that coupling a 3D mesh-based representation with Gaussian Splatting allows us to define a differentiable map between the cloth's state space and the image space. This enables the use of gradient-based optimization techniques to refine inaccurate state estimates using only RGB supervision. Our experiments demonstrate that Cloth-Splatting not only improves state estimation accuracy over current baselines but also reduces convergence time by $\\sim 85$ \\%.", "keywords": "3D State Representations;Gaussian Splatting;Deformable Objects;Vision-based Tracking", "primary_area": "", "supplementary_material": "/attachment/e224dee7107eba7202f8f2820f3c96127ac66071.zip", "author": "Alberta Longhini;Marcel B\u00fcsching;Bardienus Pieter Duisterhof;Jens Lundell;Jeffrey Ichnowski;M\u00e5rten Bj\u00f6rkman;Danica Kragic", "authorids": "~Alberta_Longhini1;~Marcel_B\u00fcsching1;~Bardienus_Pieter_Duisterhof1;~Jens_Lundell1;~Jeffrey_Ichnowski1;~M\u00e5rten_Bj\u00f6rkman2;~Danica_Kragic1", "gender": "F;M;M;M;M;F;M", "homepage": "https://albilo17.github.io/;https://buesma.github.io/;https://bart-ai.com;https://jenslundell.ai/;https://ichnow.ski;http://www.csc.kth.se/~danik;https://www.kth.se/profile/celle", "dblp": ";348/9264;243/5766;;89/1741;82/1211;", "google_scholar": "gwFVvsQAAAAJ;9mc2T9IAAAAJ;LLsYMFYAAAAJ;OPZZRVsAAAAJ;1OdtfywAAAAJ;;https://scholar.google.se/citations?user=jKjp9h4AAAAJ", "orcid": ";0000-0001-9296-9166;;;0000-0003-4874-9478;;", "linkedin": ";marcel-buesching/;;;;;", "or_profile": "~Alberta_Longhini1;~Marcel_B\u00fcsching1;~Bardienus_Pieter_Duisterhof1;~Jens_Lundell1;~Jeffrey_Ichnowski1;~Danica_Kragic1;~Marten_Bjoerkman1", "aff": "KTH Royal Institute of Technology;KTH Royal Institute of Technology;Naver Labs Europe;KTH Royal Institute of Technology;Carnegie Mellon University;KTH;KTH Royal Institute of Technology, Stockholm, Sweden", "aff_domain": "kth.se;kth.se;naverlabs.com;kth.se;cmu.edu;kth.se;kth.se", "position": "PhD student;PhD student;Intern;Postdoc;Assistant Professor;Professor;Associate Professor", "bibtex": "@inproceedings{\nlonghini2024clothsplatting,\ntitle={Cloth-Splatting: 3D Cloth State Estimation from {RGB} Supervision},\nauthor={Alberta Longhini and Marcel B{\\\"u}sching and Bardienus Pieter Duisterhof and Jens Lundell and Jeffrey Ichnowski and M{\\r{a}}rten Bj{\\\"o}rkman and Danica Kragic},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WmWbswjTsi}\n}", "github": "", "project": "", "reviewers": "wB9e;eUkr;FCbJ", "site": "https://openreview.net/forum?id=WmWbswjTsi", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=18138468588181072645&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;0;2;0;0", "aff_unique_norm": "KTH Royal Institute of Technology;NAVER LABS;Carnegie Mellon University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.kth.se;https://labs.naver.com;https://www.cmu.edu", "aff_unique_abbr": "KTH;NLE;CMU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stockholm", "aff_country_unique_index": "0;0;1;0;2;0;0", "aff_country_unique": "Sweden;Unknown;United States" }, { "id": "WnSl42M9Z4", "title": "HumanPlus: Humanoid Shadowing and Imitation from Humans", "track": "main", "status": "Poster", "tldr": "", "abstract": "One of the key 
arguments for building robots that have similar form factors to human beings is that we can leverage the massive human data for training. Yet, doing so has remained challenging in practice due to the complexities in humanoid perception and control, lingering physical gaps between humanoids and humans in morphologies and actuation, and lack of a data pipeline for humanoids to learn autonomous skills from egocentric vision. In this paper, we introduce a full-stack system for humanoids to learn motion and autonomous skills from human data. We first train a low-level policy in simulation via reinforcement learning using existing 40-hour human motion datasets. This policy transfers to the real world and allows humanoid robots to follow human body and hand motion in real time using only an RGB camera, i.e. shadowing. Through shadowing, human operators can teleoperate humanoids to collect whole-body data for learning different tasks in the real world. Using the data collected, we then perform supervised behavior cloning to train skill policies using egocentric vision, allowing humanoids to complete different tasks autonomously by imitating human skills. We demonstrate the system on our customized 33-DoF 180cm humanoid, autonomously completing tasks such as wearing a shoe to stand up and walk, folding a sweatshirt, rearranging objects, typing, and greeting another robot with 60-100% success rates using up to 40 demonstrations.", "keywords": "Humanoids;Learning from Human Data;Whole-Body Control", "primary_area": "", "supplementary_material": "/attachment/0ab441c33274734f0ff34007aebc7603b4c6b0ec.zip", "author": "Zipeng Fu;Qingqing Zhao;Qi Wu;Gordon Wetzstein;Chelsea Finn", "authorids": "~Zipeng_Fu1;~Qingqing_Zhao1;wuqi23@stanford.edu;~Gordon_Wetzstein3;~Chelsea_Finn1", "gender": "M;F;;M;F", "homepage": "https://zipengfu.github.io;https://qingqing-zhao.github.io/;;http://web.stanford.edu/~gordonwz/;https://ai.stanford.edu/~cbfinn/", "dblp": "245/1504;;;13/4660;131/1783", "google_scholar": "wMcPTbEAAAAJ;https://scholar.google.com/citations?hl=en;;VOf45S0AAAAJ;vfPE6hgAAAAJ", "orcid": ";;;0000-0002-9243-6885;", "linkedin": "zipengfu;;;gordon-wetzstein-2406723/;", "or_profile": "~Zipeng_Fu1;~Qingqing_Zhao1;wuqi23@stanford.edu;~Gordon_Wetzstein3;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;;Stanford University;Google", "aff_domain": "stanford.edu;stanford.edu;;stanford.edu;google.com", "position": "PhD student;PhD student;;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nfu2024humanplus,\ntitle={HumanPlus: Humanoid Shadowing and Imitation from Humans},\nauthor={Zipeng Fu and Qingqing Zhao and Qi Wu and Gordon Wetzstein and Chelsea Finn},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=WnSl42M9Z4}\n}", "github": "https://github.com/MarkFzp/humanplus", "project": "", "reviewers": "uR2D;mL5C;ycZ6", "site": "https://openreview.net/forum?id=WnSl42M9Z4", "pdf_size": 0, "rating": "4;4;4", "confidence": "4;4;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 109, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13643739193287635334&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index":
"0;0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "X3OfR3axX4", "title": "Multi-Transmotion: Pre-trained Model for Human Motion Prediction", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability of intelligent systems to predict human behaviors is essential, particularly in fields such as autonomous vehicle navigation and social robotics. However, the intricacies of human motion have precluded the development of a standardized dataset and model for human motion prediction, thereby hindering the establishment of pre-trained models. In this paper, we address these limitations by integrating multiple datasets, encompassing both trajectory and 3D pose keypoints, to further propose a pre-trained model for human motion prediction. We merge seven distinct datasets across varying modalities and standardize their formats. To facilitate multimodal pre-training, we introduce Multi-Transmotion, an innovative transformer-based model capable of cross-modality pre-training. Additionally, we devise a novel masking strategy to learn rich representations. Our methodology demonstrates competitive performance across various datasets on several downstream tasks, including trajectory prediction in the NBA and JTA datasets, as well as pose prediction in the AMASS and 3DPW datasets. The code will be made available upon publication.", "keywords": "Human motion prediction;Pre-training;Transformer", "primary_area": "", "supplementary_material": "", "author": "Yang Gao;Po-Chien Luan;Alexandre Alahi", "authorids": "~Yang_Gao15;~Po-Chien_Luan1;~Alexandre_Alahi3", "gender": "M;M;M", "homepage": ";;https://vita.epfl.ch/", "dblp": ";;48/3455", "google_scholar": ";Y2Oth4MAAAAJ;UIhXQ64AAAAJ", "orcid": "0000-0002-3695-9155;;", "linkedin": "yang-gao-525910248/;;", "or_profile": "~Yang_Gao15;~Po-Chien_Luan1;~Alexandre_Alahi3", "aff": "EPFL - EPF Lausanne;EPFL - EPF Lausanne;EPFL", "aff_domain": "epfl.ch;epfl.ch;epfl.ch", "position": "PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\ngao2024multitransmotion,\ntitle={Multi-Transmotion: Pre-trained Model for Human Motion Prediction},\nauthor={Yang Gao and Po-Chien Luan and Alexandre Alahi},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=X3OfR3axX4}\n}", "github": "https://github.com/vita-epfl/multi-transmotion", "project": "", "reviewers": "ePRe;6LnL;4Ldu;CsuV", "site": "https://openreview.net/forum?id=X3OfR3axX4", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "4;5;3;3", "rating_avg": 2.75, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": -0.17407765595569782, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7172147342397299943&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "EPFL", "aff_unique_dep": "", "aff_unique_url": "https://www.epfl.ch", "aff_unique_abbr": "EPFL", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Lausanne;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "XopATjibyz", "title": "Learning Quadruped Locomotion Using Differentiable Simulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "This work explores the potential of using differentiable simulation for learning robot control. 
Differentiable simulation promises fast convergence and stable training by computing low-variance first-order gradients using the robot model. Still, so far, its usage for legged robots is limited to simulation. The main challenge lies in the complex optimization landscape of robotic tasks due to discontinuous dynamics. This work proposes a new differentiable simulation framework to overcome these challenges. The key idea involves decoupling the complex whole-body simulation, which may exhibit discontinuities due to contact into two separate continuous domains. Subsequently, we align the robot state resulting from the simplified model with a more precise, non-differentiable simulator to maintain sufficient simulation accuracy. Our framework enables learning quadruped walking in simulation in minutes without parallelization. When augmented with GPU parallelization, our approach allows the quadruped robot to master diverse locomotion skills on challenging terrains in minutes. We demonstrate that differentiable simulation outperforms a reinforcement \nlearning algorithm (PPO) by achieving significantly better sample efficiency while maintaining its effectiveness in handling large-scale environments. Our policy achieves robust locomotion performance in the real world zero-shot.", "keywords": "Differentiable Simulation;Legged Locomotion;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/dfba68b8975b51b8548b9307de0a1323234681bb.zip", "author": "Yunlong Song;Sang bae Kim;Davide Scaramuzza", "authorids": "~Yunlong_Song1;~Sang_bae_Kim1;~Davide_Scaramuzza1", "gender": "M;M;", "homepage": "https://yun-long.github.io/;https://biomimetics.mit.edu/;", "dblp": ";;", "google_scholar": "EzAXL9QAAAAJ;;", "orcid": "0000-0002-6352-3744;;", "linkedin": "yunlong-song-a80baa124/;;", "or_profile": "~Yunlong_Song1;~Sang_bae_Kim1;~Davide_Scaramuzza1", "aff": ";Massachusetts Institute of Technology;", "aff_domain": ";mit.edu;", "position": ";Full Professor;", "bibtex": "@inproceedings{\nsong2024learning,\ntitle={Learning Quadruped Locomotion Using Differentiable Simulation},\nauthor={Yunlong Song and Sang bae Kim and Davide Scaramuzza},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XopATjibyz}\n}", "github": "", "project": "", "reviewers": "ersp;VLv4;Mbex", "site": "https://openreview.net/forum?id=XopATjibyz", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 15, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2266984770706163824&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_country_unique_index": "0", "aff_country_unique": "United States" }, { "id": "XrxLGzF0lJ", "title": "So You Think You Can Scale Up Autonomous Robot Data Collection?", "track": "main", "status": "Poster", "tldr": "", "abstract": "A long-standing goal in robot learning is to develop methods for robots to acquire new skills autonomously. 
While reinforcement learning (RL) comes with the promise of enabling autonomous data collection, it remains challenging to scale in the real world, partly due to the significant effort required for environment design and instrumentation, including the need for designing reset functions or accurate success detectors. On the other hand, imitation learning (IL) methods require little to no environment design effort, but instead require significant human supervision in the form of collected demonstrations. To address these shortcomings, recent works in autonomous IL start with an initial seed dataset of human demonstrations that an autonomous policy can bootstrap from. While autonomous IL approaches come with the promise of addressing the challenges of autonomous RL\u2014environment design challenges\u2014as well as the challenges of pure IL strategies\u2014extensive human supervision\u2014in this work, we posit that such techniques do not deliver on this promise and are still unable to scale up autonomous data collection in the real world. Through a series of targeted real-world experiments, we demonstrate that these approaches, when scaled up to realistic settings, face many of the same scaling challenges as prior attempts in RL in terms of environment design. Further, we perform a rigorous study of various autonomous IL methods across different data scales and 7 simulation and real-world tasks, and demonstrate that while autonomous data collection can modestly improve performance (on the order of 10%), simply collecting more human data often provides significantly more improvement. Our work suggests a negative result: that scaling up autonomous data collection for learning robot policies for real-world tasks is more challenging and impractical than what is suggested in prior work.
We hope these insights about the core challenges of scaling up data collection help inform future efforts in autonomous learning.", "keywords": "autonomous data collection;imitation learning", "primary_area": "", "supplementary_material": "/attachment/0b665ea51eb7feec6726391b25be922f6b936957.zip", "author": "Suvir Mirchandani;Suneel Belkhale;Joey Hejna;Evelyn Choi;Md Sazzad Islam;Dorsa Sadigh", "authorids": "~Suvir_Mirchandani1;~Suneel_Belkhale1;~Joey_Hejna1;~Evelyn_Choi1;~Md_Sazzad_Islam1;~Dorsa_Sadigh1", "gender": "M;M;F;M;F;M", "homepage": "http://suvirpmirchandani.com;https://github.com/suneelbelkhale;;;https://dorsa.fyi/;https://joeyhejna.com", "dblp": "287/4981;236/5069;;;117/3174;336/3297", "google_scholar": "fz7LJPIAAAAJ;;;;ZaJEZpYAAAAJ;y_sLoXoAAAAJ", "orcid": ";0000-0002-3963-7987;;0009-0002-6512-7419;;", "linkedin": ";suneel-b-032b1a101/;evelyn-choi-176508192;sazzad14/;;", "or_profile": "~Suvir_Mirchandani1;~Suneel_Belkhale1;~Evelyn_Choi1;~Md_Sazzad_Islam1;~Dorsa_Sadigh1;~Donald_Joseph_Hejna_III1", "aff": "Stanford University;Stanford University;Stanford University;Stanford University;Stanford University;Google", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu;google.com", "position": "PhD student;PhD student;Undergrad student;Undergrad student;Assistant Professor;Intern", "bibtex": "@inproceedings{\nmirchandani2024so,\ntitle={So You Think You Can Scale Up Autonomous Robot Data Collection?},\nauthor={Suvir Mirchandani and Suneel Belkhale and Joey Hejna and Evelyn Choi and Md Sazzad Islam and Dorsa Sadigh},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=XrxLGzF0lJ}\n}", "github": "", "project": "", "reviewers": "wA2S;BvKu;7gqs", "site": "https://openreview.net/forum?id=XrxLGzF0lJ", "pdf_size": 0, "rating": "3;3;3", "confidence": "2;3;4", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5465296601030280047&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.stanford.edu;https://www.google.com", "aff_unique_abbr": "Stanford;Google", "aff_campus_unique_index": "0;0;0;0;0;1", "aff_campus_unique": "Stanford;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "YOFrRTDC6d", "title": "SkillMimicGen: Automated Demonstration Generation for Efficient Skill Learning and Deployment", "track": "main", "status": "Poster", "tldr": "", "abstract": "Imitation learning from human demonstrations is an effective paradigm for robot manipulation, but acquiring large datasets is costly and resource-intensive, especially for long-horizon tasks. To address this issue, we propose SkillGen, an automated system for generating demonstration datasets from a few human demos. SkillGen segments human demos into manipulation skills, adapts these skills to new contexts, and stitches them together through free-space transit and transfer motion. We also propose a Hybrid Skill Policy (HSP) framework for learning skill initiation, control, and termination components from SkillGen datasets, enabling skills to be sequenced using motion planning at test-time. 
We demonstrate that SkillGen greatly improves data generation and policy learning performance over a state-of-the-art data generation framework, resulting in the capability to produce data for large scene variations, including clutter, and agents that are on average 24% more successful. We demonstrate the efficacy of SkillGen by generating over 24K demonstrations across 18 task variants in simulation from just 60 human demonstrations, and training proficient, often near-perfect, HSP agents. Finally, we apply SkillGen to 3 real-world manipulation tasks and demonstrate zero-shot sim-to-real transfer on a long-horizon assembly task. Videos, and more at https://skillgen.github.io.", "keywords": "Imitation Learning;Manipulation;Planning", "primary_area": "", "supplementary_material": "/attachment/c42e45e9f6e1a333eb88563c91536165d152d80e.zip", "author": "Caelan Reed Garrett;Ajay Mandlekar;Bowen Wen;Dieter Fox", "authorids": "~Caelan_Reed_Garrett1;~Ajay_Mandlekar1;~Bowen_Wen1;~Dieter_Fox1", "gender": "M;M;;M", "homepage": "http://web.mit.edu/caelan/www/;https://ai.stanford.edu/~amandlek/;https://wenbowen123.github.io/;https://homes.cs.washington.edu/~fox/", "dblp": "161/9727;https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay;;f/DieterFox", "google_scholar": "KVUCqGwAAAAJ;MEz23joAAAAJ;VSG7Z0kAAAAJ;DqXsbPAAAAAJ", "orcid": "0000-0002-6474-1276;;;", "linkedin": "caelan-garrett-85197977/;;bowen-wen/;", "or_profile": "~Caelan_Reed_Garrett1;~Ajay_Mandlekar1;~Bowen_Wen1;~Dieter_Fox1", "aff": "NVIDIA;NVIDIA;NVIDIA;Department of Computer Science", "aff_domain": "nvidia.com;nvidia.com;nvidia.com;cs.washington.edu", "position": "Researcher;Researcher;Researcher;Full Professor", "bibtex": "@inproceedings{\ngarrett2024skillmimicgen,\ntitle={SkillMimicGen: Automated Demonstration Generation for Efficient Skill Learning and Deployment},\nauthor={Caelan Reed Garrett and Ajay Mandlekar and Bowen Wen and Dieter Fox},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=YOFrRTDC6d}\n}", "github": "", "project": "", "reviewers": "Gd22;LdNH;cZDT", "site": "https://openreview.net/forum?id=YOFrRTDC6d", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5279091942005563154&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "NVIDIA;Unknown Institution", "aff_unique_dep": "NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.nvidia.com;", "aff_unique_abbr": "NVIDIA;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "Yce2jeILGt", "title": "Open-TeleVision: Teleoperation with Immersive Active Visual Feedback", "track": "main", "status": "Poster", "tldr": "", "abstract": "Teleoperation serves as a powerful method for collecting on-robot data essential for robot learning from demonstrations. The intuitiveness and ease of use of the teleoperation system are crucial for ensuring high-quality, diverse, and scalable data. To achieve this, we propose an immersive teleoperation system $\\textbf{Open-TeleVision}$ that allows operators to actively perceive the robot's surroundings in a stereoscopic manner.
Additionally, the system mirrors the operator's arm and hand movements on the robot, creating an immersive experience as if the operator's mind is transmitted to a robot embodiment. We validate the effectiveness of our system by collecting data and training imitation learning policies on four long-horizon, precise tasks (can sorting, can insertion, folding, and unloading) for 2 different humanoid robots and deploy them in the real world. The entire system will be open-sourced.", "keywords": "Teleoperation;VR/AR;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/63b2250c42c34db55237cf049dc8c5e054140db1.zip", "author": "Xuxin Cheng;Jialong Li;Shiqi Yang;Ge Yang;Xiaolong Wang", "authorids": "~Xuxin_Cheng2;~Jialong_Li3;~Shiqi_Yang2;~Ge_Yang1;~Xiaolong_Wang3", "gender": "M;M;M;M;M", "homepage": "https://chengxuxin.github.io;https://rexskywalkerlee.github.io/;https://aaronyang1223.github.io/;http://www.episodeyang.com;https://xiaolonw.github.io/", "dblp": ";;;48/4561-3;91/952-4", "google_scholar": "Z8vhOxYAAAAJ;;OQQzJb4AAAAJ;vaQcF6kAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;0009-0009-8529-4522;0000-0001-7520-7055;", "linkedin": ";jialong-li-737a561a8/;;;", "or_profile": "~Xuxin_Cheng2;~Jialong_Li3;~Shiqi_Yang2;~Ge_Yang1;~Xiaolong_Wang3", "aff": "University of California, San Diego;University of California, San Diego;University of California, San Diego;Massachusetts Institute of Technology;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;ucsd.edu;mit.edu;ucsd.edu", "position": "PhD student;MS student;MS student;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\ncheng2024opentelevision,\ntitle={Open-TeleVision: Teleoperation with Immersive Active Visual Feedback},\nauthor={Xuxin Cheng and Jialong Li and Shiqi Yang and Ge Yang and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yce2jeILGt}\n}", "github": "https://github.com/OpenTeleVision/TeleVision", "project": "", "reviewers": "QGk6;xknk;mH7J", "site": "https://openreview.net/forum?id=Yce2jeILGt", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 99, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11464578025569732339&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "University of California, San Diego;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://web.mit.edu", "aff_unique_abbr": "UCSD;MIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "San Diego;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "Yw5QGNBkEN", "title": "Scaling Manipulation Learning with Visual Kinematic Chain Prediction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning general-purpose models from diverse datasets has achieved great success in machine learning. In robotics, however, existing methods in multi-task learning are typically constrained to a single robot and workspace, while recent work such as RT-X requires a non-trivial action normalization procedure to manually bridge the gap between different action spaces in diverse environments. 
In this paper, we propose the visual kinematics chain as a precise and universal representation of quasi-static actions for robot learning over diverse environments, which requires no manual adjustment since the visual kinematic chains can be automatically obtained from the robot\u2019s model and camera parameters. We propose the Visual Kinematics Transformer (VKT), a convolution-free architecture that supports an arbitrary number of camera viewpoints, and that is trained with a single objective of forecasting kinematic structures through optimal point-set matching. We demonstrate the superior performance of VKT over BC transformers as a general agent on Calvin, RLBench, ALOHA, Open-X, and real robot manipulation tasks. Video demonstrations and source code can be found at https://mlzxy.github.io/visual-kinetic-chain.", "keywords": "Multi-Task Robot Learning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/57f505106ae306faf96a28e95a3ac0bdf8bffc2a.zip", "author": "Xinyu Zhang;Yuhan Liu;Haonan Chang;Abdeslam Boularias", "authorids": "~Xinyu_Zhang7;~Yuhan_Liu2;~Haonan_Chang1;~Abdeslam_Boularias1", "gender": "M;M;M;M", "homepage": "https://mlzxy.github.io/;;https://github.com/changhaonan;http://rl.cs.rutgers.edu/", "dblp": ";125/8141;;57/2269", "google_scholar": "M7hnG9oAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Xinyu_Zhang7;~Yuhan_Liu2;~Haonan_Chang1;~Abdeslam_Boularias1", "aff": "Rutgers University;Rutgers University;Rutgers, New Brunswick;, Rutgers University", "aff_domain": "rutgers.edu;rutgers.edu;scarletmail.rutgers.edu;cs.rutgers.edu", "position": "PhD student;PhD student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nzhang2024scaling,\ntitle={Scaling Manipulation Learning with Visual Kinematic Chain Prediction},\nauthor={Xinyu Zhang and Yuhan Liu and Haonan Chang and Abdeslam Boularias},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=Yw5QGNBkEN}\n}", "github": "", "project": "", "reviewers": "TuBj;t5uW;Lf1f", "site": "https://openreview.net/forum?id=Yw5QGNBkEN", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8945863786073513095&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Rutgers University", "aff_unique_dep": "", "aff_unique_url": "https://www.rutgers.edu", "aff_unique_abbr": "Rutgers", "aff_campus_unique_index": "1", "aff_campus_unique": ";New Brunswick", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "ZMnD6QZAE6", "title": "OpenVLA: An Open-Source Vision-Language-Action Model", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large policies pretrained on a combination of Internet-scale vision-language data and diverse robot demonstrations have the potential to change how we teach robots new skills: rather than training new behaviors from scratch, we can fine-tune such vision-language-action (VLA) models to obtain robust, generalizable policies for visuomotor control. 
Yet, widespread adoption of VLAs for robotics has been challenging as 1) existing VLAs are largely closed and inaccessible to the public, and 2) prior work fails to explore methods for efficiently fine-tuning VLAs for new tasks, a key component for adoption. Addressing these challenges, we introduce OpenVLA, a 7B-parameter open-source VLA trained on a diverse collection of 970k real-world robot demonstrations. OpenVLA builds on a Llama 2 language model combined with a visual encoder that fuses pretrained features from DINOv2 and SigLIP. As a product of the added data diversity and new model components, OpenVLA demonstrates strong results for generalist manipulation, outperforming closed models such as RT-2-X (55B) by 16.5\\% in absolute task success rate across 29 tasks and multiple robot embodiments, with 7x fewer parameters. We further show that we can effectively fine-tune OpenVLA for new settings, with especially strong generalization results in multi-task environments involving multiple objects and strong language grounding abilities, where we outperform expressive from-scratch imitation learning methods such as Diffusion Policy by 20.4\\%. We also explore compute efficiency; as a separate contribution, we show that OpenVLA can be fine-tuned on consumer GPUs via modern low-rank adaptation methods and served efficiently via quantization without a hit to downstream success rate. Finally, we release model checkpoints, fine-tuning notebooks, and our PyTorch codebase with built-in support for training VLAs at scale on Open X-Embodiment datasets.", "keywords": "Vision-Language-Action Models;Generalist Policies;Large-scale Robot Learning;Robotic Manipulation;Robotics;Vision-Language Models", "primary_area": "", "supplementary_material": "/attachment/1318377860aa083955bff5f25bdc309a2124d6b1.zip", "author": "Moo Jin Kim;Karl Pertsch;Siddharth Karamcheti;Ted Xiao;Ashwin Balakrishna;Suraj Nair;Rafael Rafailov;Ethan P Foster;Pannag R Sanketi;Quan Vuong;Thomas Kollar;Benjamin Burchfiel;Russ Tedrake;Dorsa Sadigh;Sergey Levine;Percy Liang;Chelsea Finn", "authorids": "~Moo_Jin_Kim1;~Karl_Pertsch1;~Siddharth_Karamcheti1;~Ted_Xiao1;~Ashwin_Balakrishna1;~Suraj_Nair1;~Rafael_Rafailov1;~Ethan_P_Foster1;~Pannag_R_Sanketi1;~Quan_Vuong2;~Thomas_Kollar1;~Benjamin_Burchfiel1;~Russ_Tedrake1;~Dorsa_Sadigh1;~Sergey_Levine1;~Percy_Liang1;~Chelsea_Finn1", "gender": "M;;M;M;M;M;M;;M;M;M;M;M;F;M;;F", "homepage": "https://moojink.com;https://kpertsch.github.io/;http://siddkaramcheti.com/;https://www.tedxiao.me;https://abalakrishna123.github.io/;https://suraj-nair-1.github.io/;https://rmrafailov.github.io/;;;https://quanvuong.github.io;http://tkollar.github.io;http://www.benburchfiel.com/;http://people.csail.mit.edu/russt;https://dorsa.fyi/;https://people.eecs.berkeley.edu/~svlevine/;https://cs.stanford.edu/~pliang/;https://ai.stanford.edu/~cbfinn/", "dblp": ";211/7137;199/1922;198/0598;218/5246.html;;272/5358;;;;10/6653;136/9247;73/1296;117/3174;80/7594;04/1701;131/1783", "google_scholar": "ZKRs0oEAAAAJ;https://scholar.google.com/citations?view_op=list_works;L5v2PHAAAAAJ;;tfN6V84AAAAJ;EHSuFcwAAAAJ;TwABcRgAAAAJ;;GuU6oA4AAAAJ;NSWI3OwAAAAJ;AEKT17QAAAAJ;eGoTK1YAAAAJ;nxNkEiYAAAAJ;ZaJEZpYAAAAJ;8R35rCwAAAAJ;pouyVyUAAAAJ;vfPE6hgAAAAJ", "orcid": ";;;;;;;;;;0000-0003-2598-8118;;;;;;", "linkedin": "moojink/;;;;ashwin-balakrishna-9b71a357/;;;ethan-paul-foster/;;;;benburchfiel/;;;;;", "or_profile": 
"~Moo_Jin_Kim1;~Karl_Pertsch1;~Siddharth_Karamcheti1;~Ted_Xiao1;~Ashwin_Balakrishna1;~Suraj_Nair1;~Rafael_Rafailov1;~Ethan_P_Foster1;~Pannag_R_Sanketi1;~Quan_Vuong2;~Thomas_Kollar1;~Benjamin_Burchfiel1;~Russ_Tedrake1;~Dorsa_Sadigh1;~Sergey_Levine1;~Percy_Liang1;~Chelsea_Finn1", "aff": "Stanford University;Stanford University;Stanford University;;Toyota Research Institute;Toyota Research Institute;Stanford University;Stanford University;Google;physical intelligence;Toyota Research Institute;Dexterous Manipulation Group, Toyota Research Institute;Massachusetts Institute of Technology;Stanford University;Google;Stanford University;Google", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;;tri.global;tri.global;stanford.edu;stanford.edu;google.com;physicalintelligence.company;tri.global;tri.global;mit.edu;stanford.edu;google.com;stanford.edu;google.com", "position": "PhD student;Postdoc;PhD student;;Researcher;Researcher;PhD student;Undergrad student;Researcher;Researcher;Principal Researcher;Researcher;Full Professor;Assistant Professor;Research Scientist;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nkim2024openvla,\ntitle={Open{VLA}: An Open-Source Vision-Language-Action Model},\nauthor={Moo Jin Kim and Karl Pertsch and Siddharth Karamcheti and Ted Xiao and Ashwin Balakrishna and Suraj Nair and Rafael Rafailov and Ethan P Foster and Pannag R Sanketi and Quan Vuong and Thomas Kollar and Benjamin Burchfiel and Russ Tedrake and Dorsa Sadigh and Sergey Levine and Percy Liang and Chelsea Finn},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZMnD6QZAE6}\n}", "github": "https://github.com/openvla/openvla", "project": "", "reviewers": "LnaQ;t3E9;GEDn", "site": "https://openreview.net/forum?id=ZMnD6QZAE6", "pdf_size": 0, "rating": "4;4;4", "confidence": "3;4;4", "rating_avg": 4.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 17, "corr_rating_confidence": 0.0, "gs_citation": 437, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14110841752433820937&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;1;0;0;2;3;1;1;4;0;2;0;2", "aff_unique_norm": "Stanford University;Toyota Research Institute;Google;Physical Intelligence;Massachusetts Institute of Technology", "aff_unique_dep": ";;Google;;", "aff_unique_url": "https://www.stanford.edu;https://www.tri.global;https://www.google.com;;https://web.mit.edu", "aff_unique_abbr": "Stanford;TRI;Google;;MIT", "aff_campus_unique_index": "0;0;0;0;0;2;0;2;0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "ZdgaF8fOc0", "title": "Bridging the gap between Learning-to-plan, Motion Primitives and Safe Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Trajectory planning under kinodynamic constraints is fundamental for advanced robotics applications that require dexterous, reactive, and rapid skills in complex environments. These constraints, which may represent task, safety, or actuator limitations, are essential for ensuring the proper functioning of robotic platforms and preventing unexpected behaviors. Recent advances in kinodynamic planning demonstrate that learning-to-plan techniques can generate complex and reactive motions under intricate constraints. 
However, these techniques necessitate the analytical modeling of both the robot and the entire task, a limiting assumption when systems are extremely complex or when constructing accurate task models is prohibitive.\nThis paper addresses this limitation by combining learning-to-plan methods with reinforcement learning, resulting in a novel integration of black-box learning of motion primitives and optimization. We evaluate our approach against state-of-the-art safe reinforcement learning methods, showing that our technique, particularly when exploiting task structure, outperforms baseline methods in challenging scenarios such as planning to hit in robot air hockey. This work demonstrates the potential of our integrated approach to enhance the performance and safety of robots operating under complex kinodynamic constraints.", "keywords": "safe reinforcement learning;motion planning;motion primitives", "primary_area": "", "supplementary_material": "/attachment/3cb8db246c064dc081b012545bc89af765ce5854.zip", "author": "Piotr Kicki;Davide Tateo;Puze Liu;Jonas G\u00fcnster;Jan Peters;Krzysztof Walas", "authorids": "~Piotr_Kicki1;~Davide_Tateo2;~Puze_Liu1;~Jonas_G\u00fcnster1;~Jan_Peters3;~Krzysztof_Walas2", "gender": "M;M;M;M;M;M", "homepage": ";https://www.ias.informatik.tu-darmstadt.de/Team/DavideTateo;https://puzeliu.github.io/;;https://www.jan-peters.net;https://ideas-ncbr.pl/en/osoby/krzysztof-walas/", "dblp": "234/2595;214/0808;292/4069;;p/JanPeters1;05/9858.html", "google_scholar": "tilnVjMAAAAJ;https://scholar.google.it/citations?user=LGnu3SEAAAAJ;zg-FMloAAAAJ;;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;0FZ0cZQAAAAJ", "orcid": ";0000-0002-7193-923X;0000-0001-6887-7704;;0000-0002-5266-8091;0000-0002-2800-2716", "linkedin": ";;;jonas-g%C3%BCnster-6b49a2186/;janrpeters/;krzysztof-walas-850492a7/", "or_profile": "~Piotr_Kicki1;~Davide_Tateo2;~Puze_Liu1;~Jonas_G\u00fcnster1;~Jan_Peters3;~Krzysztof_Walas2", "aff": "IDEAS NCBR Sp.;Technische Universit\u00e4t Darmstadt;TU Darmstadt;Technische Universit\u00e4t Darmstadt;TU Darmstadt;Technical University of Poznan", "aff_domain": "ideas-ncbr.pl;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de;put.poznan.pl", "position": "Postdoc;Researcher;PhD student;MS student;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nkicki2024bridging,\ntitle={Bridging the gap between Learning-to-plan, Motion Primitives and Safe Reinforcement Learning},\nauthor={Piotr Kicki and Davide Tateo and Puze Liu and Jonas G{\\\"u}nster and Jan Peters and Krzysztof Walas},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ZdgaF8fOc0}\n}", "github": "https://github.com/pkicki/spline_rl/", "project": "", "reviewers": "gN6n;j8eQ;pgZh", "site": "https://openreview.net/forum?id=ZdgaF8fOc0", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;3;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10274044461515364233&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;1;1;2", "aff_unique_norm": "IDEAS NCBR;Technische Universit\u00e4t Darmstadt;Technical University of Poznan", "aff_unique_dep": ";;", "aff_unique_url": ";https://www.tu-darmstadt.de;https://www.put.poznan.pl/", "aff_unique_abbr": ";TUD;PUT", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Darmstadt", "aff_country_unique_index": 
"0;1;1;1;1;0", "aff_country_unique": "Poland;Germany" }, { "id": "aaY5fVFMVf", "title": "Conformal Prediction for Semantically-Aware Autonomous Perception in Urban Environments", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce Knowledge-Refined Prediction Sets (KRPS), a novel approach that performs semantically-aware uncertainty quantification for multitask-based autonomous perception in urban environments. KRPS extends conformal prediction (CP) to ensure 2 properties not typically addressed by CP frameworks: semantic label consistency and true label coverage, across multiple perception tasks. We elucidate the capability of KRPS through high-level classification tasks crucial for semantically-aware autonomous perception in urban environments, including agent classification, agent location classification, and agent action classification. In a theoretical analysis, we introduce the concept of semantic label consistency among tasks and prove the semantic consistency and marginal coverage properties of the produced sets by KRPS. The results of our evaluation on the ROAD dataset and the Waymo/ROAD++ dataset show that KRPS outperforms state-of-the-art CP methods in reducing uncertainty by up to 80\\% and increasing the semantic consistency by up to 30\\%, while maintaining the coverage guarantees.", "keywords": "Uncertainty in Robotics;Robot Perception;Semantics for Robotics", "primary_area": "", "supplementary_material": "/attachment/0691d8ad64adc804ba0004338636f671a9c6695d.zip", "author": "Achref Doula;Tobias G\u00fcdelh\u00f6fer;Max M\u00fchlh\u00e4user;Alejandro Sanchez Guinea", "authorids": "~Achref_Doula1;~Tobias_G\u00fcdelh\u00f6fer1;~Max_M\u00fchlh\u00e4user1;~Alejandro_Sanchez_Guinea1", "gender": ";;M;M", "homepage": ";;https://www.informatik.tu-darmstadt.de/telekooperation/telecooperation_group/staff_1/staff_1_details_23168.en.jsp;https://www.informatik.tu-darmstadt.de/telekooperation/staff_tk/tk_staff_details_77248.en.jsp", "dblp": ";325/3058;m/MaxMuhlhauser;132/8925", "google_scholar": ";https://scholar.google.com/citations?hl=en;https://scholar.google.com.tw/citations?user=69rd-UAAAAAJ;qrpXZqQAAAAJ", "orcid": ";;0000-0003-4713-5327;", "linkedin": ";;;", "or_profile": "~Achref_Doula1;~Tobias_G\u00fcdelh\u00f6fer1;~Max_M\u00fchlh\u00e4user1;~Alejandro_Sanchez_Guinea1", "aff": ";Technische Universit\u00e4t Darmstadt;Technische Universit\u00e4t Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": ";tu-darmstadt.de;tu-darmstadt.de;tu-darmstadt.de", "position": ";MS student;Full Professor;Postdoc", "bibtex": "@inproceedings{\ndoula2024conformal,\ntitle={Conformal Prediction for Semantically-Aware Autonomous Perception in Urban Environments},\nauthor={Achref Doula and Tobias G{\\\"u}delh{\\\"o}fer and Max M{\\\"u}hlh{\\\"a}user and Alejandro Sanchez Guinea},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=aaY5fVFMVf}\n}", "github": "https://gitlab.com/achref.d/krps", "project": "", "reviewers": "Tzck;UhpA;8pRV;gYc2", "site": "https://openreview.net/forum?id=aaY5fVFMVf", "pdf_size": 0, "rating": "1;3;3;3", "confidence": "3;4;3;5", "rating_avg": 2.5, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:dFaUUEOpG6AJ:scholar.google.com/&scioq=Conformal+Prediction+for+Semantically-Aware+Autonomous+Perception+in+Urban+Environments&hl=en&as_sdt=0,33", 
"gs_version_total": 0, "aff_unique_index": "0;0;0", "aff_unique_norm": "Technische Universit\u00e4t Darmstadt", "aff_unique_dep": "", "aff_unique_url": "https://www.tu-darmstadt.de", "aff_unique_abbr": "TUD", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Germany" }, { "id": "adf3pO9baG", "title": "Dreaming to Assist: Learning to Align with Human Objectives for Shared Control in High-Speed Racing", "track": "main", "status": "Poster", "tldr": "", "abstract": "Tight coordination is required for effective human-robot teams in domains involving fast dynamics and tactical decisions, such as multi-car racing. In such settings, robot teammates must react to cues of a human teammate's tactical objective to assist in a way that is consistent with the objective (e.g., navigating left or right around an obstacle). To address this challenge, we present _Dream2Assist_, a framework that combines a rich world model able to infer human objectives and value functions, and an assistive agent that provides appropriate expert assistance to a given human teammate. Our approach builds on a recurrent state space model to explicitly infer human intents, enabling the assistive agent to select actions that align with the human and enabling a fluid teaming interaction. We demonstrate our approach in a high-speed racing domain with a population of synthetic human drivers pursuing mutually exclusive objectives, such as \"stay-behind\" and \"overtake\". We show that the combined human-robot team, when blending its actions with those of the human, outperforms synthetic humans alone and several baseline assistance strategies, and that intent-conditioning enables adherence to human preferences during task execution, leading to improved performance while satisfying the human's objective.", "keywords": "Recurrent State-Space Models;Human-Robot Interactions;Shared-Control", "primary_area": "", "supplementary_material": "/attachment/cbc872db82e387580cb7f22c6dc82f452fb88d5c.zip", "author": "Jonathan DeCastro;Andrew Silva;Deepak Gopinath;Emily Sumner;Thomas Matrai Balch;Laporsha Dees;Guy Rosman", "authorids": "~Jonathan_DeCastro1;~Andrew_Silva1;deepak.gopinath@tri.global;emily.sumner@tri.global;~Thomas_Matrai_Balch1;laporsha.dees.ctr@tri.global;~Guy_Rosman2", "gender": "M;M;;;M;;M", "homepage": "http://jadecastro.github.io/;https://www.andrew-silva.com/;;;;;http://people.csail.mit.edu/rosman/index.html", "dblp": "139/3569;58/2908;;;;;53/3441", "google_scholar": "Pnbjx1AAAAAJ;6wuXl_cAAAAJ;;;;;https://scholar.google.com/citations?hl=en", "orcid": "0000-0002-0933-9671;0000-0002-0317-5135;;;;;", "linkedin": "jonathan-decastro-b8748715/;andrew-silva-6a29026a/;;;thomas-balch-53443a43/;;", "or_profile": "~Jonathan_DeCastro1;~Andrew_Silva1;deepak.gopinath@tri.global;emily.sumner@tri.global;~Thomas_Matrai_Balch1;laporsha.dees.ctr@tri.global;~Guy_Rosman2", "aff": "Toyota Research Institute;Toyota Research Institute;;;Toyota Research Institute;;Toyota Research Institute", "aff_domain": "tri.global;tri.global;;;tri.global;;tri.global", "position": "Researcher;Researcher;;;Researcher;;Researcher", "bibtex": "@inproceedings{\ndecastro2024dreaming,\ntitle={Dreaming to Assist: Learning to Align with Human Objectives for Shared Control in High-Speed Racing},\nauthor={Jonathan DeCastro and Andrew Silva and Deepak Gopinath and Emily Sumner and Thomas Matrai Balch and Laporsha Dees and Guy Rosman},\nbooktitle={8th Annual Conference on Robot 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=adf3pO9baG}\n}", "github": "", "project": "", "reviewers": "TZys;5oAj;jTCE", "site": "https://openreview.net/forum?id=adf3pO9baG", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;5;2", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.18898223650461365, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9010740779914947409&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Toyota Research Institute", "aff_unique_dep": "", "aff_unique_url": "https://www.tri.global", "aff_unique_abbr": "TRI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "bftFwjSJxk", "title": "Rate-Informed Discovery via Bayesian Adaptive Multifidelity Sampling", "track": "main", "status": "Poster", "tldr": "", "abstract": "Ensuring the safety of autonomous vehicles (AVs) requires both accurate estimation of their performance and efficient discovery of potential failure cases. This paper introduces Bayesian adaptive multifidelity sampling (BAMS), which leverages the power of adaptive Bayesian sampling to achieve efficient discovery while simultaneously estimating the rate of adverse events. BAMS prioritizes exploration of regions with potentially low performance, leading to the identification of novel and critical scenarios that traditional methods might miss. Using real-world AV data we demonstrate that BAMS discovers 10 times as many issues as Monte Carlo (MC) and importance sampling (IS) baselines, while at the same time generating rate estimates with variances 15 and 6 times narrower than MC and IS baselines respectively.", "keywords": "Autonomous Driving;Rare-event Simulation;Adaptive Sampling", "primary_area": "", "supplementary_material": "/attachment/cc112e67be112c6cbd5028360608710fd491f4fb.zip", "author": "Aman Sinha;Payam Nikdel;Supratik Paul;Shimon Whiteson", "authorids": "~Aman_Sinha1;~Payam_Nikdel1;~Supratik_Paul1;~Shimon_Whiteson1", "gender": "M;M;M;", "homepage": "https://amansinha.com;https://www.linkedin.com/in/pnikdel/;;", "dblp": ";;180/5515;https://dblp.uni-trier.de/pers/w/Whiteson:Shimon.html", "google_scholar": "ZCa4VDcAAAAJ;JZBqZzkAAAAJ;;", "orcid": ";;;", "linkedin": "amans1nha;pnikdel/;;", "or_profile": "~Aman_Sinha1;~Payam_Nikdel1;~Supratik_Paul1;~Shimon_Whiteson1", "aff": "Princeton University;Waymo;Waymo;University of Oxford", "aff_domain": "princeton.edu;google.com;waymo.com;ox.ac.uk", "position": "Undergrad student;Researcher;Researcher;Professor", "bibtex": "@inproceedings{\nsinha2024rateinformed,\ntitle={Rate-Informed Discovery via Bayesian Adaptive Multifidelity Sampling},\nauthor={Aman Sinha and Payam Nikdel and Supratik Paul and Shimon Whiteson},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bftFwjSJxk}\n}", "github": "", "project": "", "reviewers": "GzcW;EQTe", "site": "https://openreview.net/forum?id=bftFwjSJxk", "pdf_size": 0, "rating": "3;3", "confidence": "4;3", "rating_avg": 3.0, "confidence_avg": 3.5, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:LzJX3vB9WQYJ:scholar.google.com/&scioq=Rate-Informed+Discovery+via+Bayesian+Adaptive+Multifidelity+Sampling&hl=en&as_sdt=0,33", "gs_version_total": 
4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "Princeton University;Waymo;University of Oxford", "aff_unique_dep": ";;", "aff_unique_url": "https://www.princeton.edu;https://www.waymo.com;https://www.ox.ac.uk", "aff_unique_abbr": "Princeton;Waymo;Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United States;United Kingdom" }, { "id": "bk28WlkqZn", "title": "3D-ViTac: Learning Fine-Grained Manipulation with Visuo-Tactile Sensing", "track": "main", "status": "Poster", "tldr": "", "abstract": "Tactile and visual perception are both crucial for humans to perform fine-grained interactions with their environment. Developing similar multi-modal sensing capabilities for robots can significantly enhance and expand their manipulation skills. This paper introduces **3D-ViTac**, a multi-modal sensing and learning system designed for dexterous bimanual manipulation. Our system features tactile sensors equipped with dense sensing units, each covering an area of 3$mm^2$. These sensors are low-cost and flexible, providing detailed and extensive coverage of physical contacts, effectively complementing visual information. To integrate tactile and visual data, we fuse them into a unified 3D representation space that preserves their 3D structures and spatial relationships. The multi-modal representation can then be coupled with diffusion policies for imitation learning. Through concrete hardware experiments, we demonstrate that even low-cost robots can perform precise manipulations and significantly outperform vision-only policies, particularly in safe interactions with fragile items and executing long-horizon tasks involving in-hand manipulation. Our project page is available at https://binghao-huang.github.io/3D-ViTac/.", "keywords": "Contact-Rich Manipulation;Multi-Modal Perception;Tactile Sensing;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/202df8734ce55e32687b9a98852e10bd8190a209.zip", "author": "Binghao Huang;Yixuan Wang;Xinyi Yang;Yiyue Luo;Yunzhu Li", "authorids": "~Binghao_Huang1;~Yixuan_Wang2;~Xinyi_Yang6;~Yiyue_Luo1;~Yunzhu_Li1", "gender": ";M;F;F;M", "homepage": "https://binghao-huang.github.io/;https://wangyixuan12.github.io/;;https://yyueluo.com/;https://yunzhuli.github.io/", "dblp": ";44/4317-3;;;182/1831", "google_scholar": "nqoOetAAAAAJ;https://scholar.google.com/citations?hl=en;;;WlA92lcAAAAJ", "orcid": ";0009-0006-6641-4718;0009-0005-0610-100X;;", "linkedin": ";yixuan-wang-54298115a;;;", "or_profile": "~Binghao_Huang1;~Yixuan_Wang2;~Xinyi_Yang6;~Yiyue_Luo1;~Yunzhu_Li1", "aff": "University of Illinois Urbana-Champaign;University of Illinois, Urbana Champaign;Zhejiang University;Computer Science and Artificial Intelligence Laboratory, Electrical Engineering & Computer Science;University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu;intl.zju.edu.cn;csail.mit.edu;illinois.edu", "position": "PhD student;MS student;Undergrad student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024dvitac,\ntitle={3D-ViTac: Learning Fine-Grained Manipulation with Visuo-Tactile Sensing},\nauthor={Binghao Huang and Yixuan Wang and Xinyi Yang and Yiyue Luo and Yunzhu Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bk28WlkqZn}\n}", "github": "", "project": "", "reviewers": "AvTY;qtTt;nMTy", "site": "https://openreview.net/forum?id=bk28WlkqZn", "pdf_size": 0, "rating": "3;3;3", "confidence": 
"4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13755702249158399321&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;0", "aff_unique_norm": "University of Illinois Urbana-Champaign;Zhejiang University;Massachusetts Institute of Technology", "aff_unique_dep": ";;Computer Science and Artificial Intelligence Laboratory", "aff_unique_url": "https://illinois.edu;https://www.zju.edu.cn;https://www.csail.mit.edu", "aff_unique_abbr": "UIUC;ZJU;CSAIL", "aff_campus_unique_index": "0;0;2;0", "aff_campus_unique": "Urbana-Champaign;;Cambridge", "aff_country_unique_index": "0;0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "bt0PX0e4rE", "title": "Bootstrapping Reinforcement Learning with Imitation for Vision-Based Agile Flight", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning visuomotor policies for agile quadrotor flight presents significant difficulties, primarily from inefficient policy exploration caused by high-dimensional visual inputs and the need for precise and low-latency control.\nTo address these challenges, we propose a novel approach that combines the performance of Reinforcement Learning (RL) and the sample efficiency of Imitation Learning (IL) in the task of vision-based autonomous drone racing.\nWhile RL provides a framework for learning high-performance controllers through trial and error, it faces challenges with sample efficiency and computational demands due to the high dimensionality of visual inputs.\nConversely, IL efficiently learns from visual expert demonstrations, but it remains limited by the expert's performance and state distribution.\nTo overcome these limitations, our policy learning framework integrates the strengths of both approaches.\nOur framework contains three phases: training a teacher policy using RL with privileged state information, distilling it into a student policy via IL, and adaptive fine-tuning via RL.\nTesting in both simulated and real-world scenarios shows our approach can not only learn in scenarios where RL from scratch fails but also outperforms existing IL methods in both robustness and performance, successfully navigating a quadrotor through a race course using only visual information.", "keywords": "Quadrotor;Visuomotor Control;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/bc8a218b8942509581912fd9e86da361f6b1b7f6.zip", "author": "Jiaxu Xing;Angel Romero;Leonard Bauersfeld;Davide Scaramuzza", "authorids": "~Jiaxu_Xing1;~Angel_Romero1;~Leonard_Bauersfeld1;~Davide_Scaramuzza1", "gender": "M;;M;", "homepage": "https://www.linkedin.com/in/jiaxu-xing-78a23419a/;;https://lbfd.github.io/;", "dblp": "251/4113;;;", "google_scholar": "lo4jHxcAAAAJ;;vj9RyxIAAAAJ;", "orcid": ";;0000-0002-5790-9982;", "linkedin": ";;leonard-bauersfeld-368992150/;", "or_profile": "~Jiaxu_Xing1;~Angel_Romero1;~Leonard_Bauersfeld1;~Davide_Scaramuzza1", "aff": "Department of Informatics, University of Zurich, University of Zurich;;University of Zurich;", "aff_domain": "ifi.uzh.ch;;uzh.ch;", "position": "PhD student;;PhD student;", "bibtex": "@inproceedings{\nxing2024bootstrapping,\ntitle={Bootstrapping Reinforcement Learning with Imitation for Vision-Based Agile Flight},\nauthor={Jiaxu Xing and Angel Romero and Leonard Bauersfeld and Davide Scaramuzza},\nbooktitle={8th Annual Conference on Robot 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=bt0PX0e4rE}\n}", "github": "", "project": "", "reviewers": "8K8j;e7LB;tjv6", "site": "https://openreview.net/forum?id=bt0PX0e4rE", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 19, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9345514975960624863&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 7, "aff_unique_index": "0;0", "aff_unique_norm": "University of Zurich", "aff_unique_dep": "Department of Informatics", "aff_unique_url": "https://www.uzh.ch", "aff_unique_abbr": "UZH", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Switzerland" }, { "id": "cDXnnOhNrF", "title": "Perceive With Confidence: Statistical Safety Assurances for Navigation with Learning-Based Perception", "track": "main", "status": "Poster", "tldr": "", "abstract": "Rapid advances in perception have enabled large pre-trained models to be used out of the box for transforming high-dimensional, noisy, and partial observations of the world into rich occupancy representations. However, the reliability of these models and consequently their safe integration onto robots remains unknown when deployed in environments unseen during training. In this work, we address this challenge by rigorously quantifying the uncertainty of pre-trained perception systems for object detection via a novel calibration technique based on conformal prediction. Crucially, this procedure guarantees robustness to distribution shifts in states when perceptual outputs are used in conjunction with a planner. As a result, the calibrated perception system can be used in combination with any safe planner to provide an end-to-end statistical assurance on safety in unseen environments. We evaluate the resulting approach, Perceive with Confidence (PwC), with experiments in simulation and on hardware where a quadruped robot navigates through previously unseen indoor, static environments. These experiments validate the safety assurances for obstacle avoidance provided by PwC and demonstrate up to 40% improvements in empirical safety compared to baselines.", "keywords": "Uncertainty quantification;occupancy prediction;robot navigation", "primary_area": "", "supplementary_material": "/attachment/f81f91f0bc68c715733e1648a95d64c5b2d175c7.zip", "author": "Anushri Dixit;Zhiting Mei;Meghan Booker;Mariko Storey-Matsutani;Allen Z. 
Ren;Anirudha Majumdar", "authorids": "~Anushri_Dixit1;~Zhiting_Mei1;~Meghan_Booker1;ms8364@princeton.edu;~Allen_Z._Ren1;~Anirudha_Majumdar1", "gender": "F;F;;;M;M", "homepage": "https://www.anushridixit.com/;https://may0mei.github.io/;https://megbooker.com;;http://allenzren.github.io/;https://irom-lab.princeton.edu/majumdar/", "dblp": ";;;;;116/6436", "google_scholar": "ADThnCAAAAAJ;aWwlcNsAAAAJ;;;mgMzkYMAAAAJ;ibu3FwsAAAAJ", "orcid": ";;;;;", "linkedin": ";;;;allenzren/;", "or_profile": "~Anushri_Dixit1;~Zhiting_Mei1;~Meghan_Booker1;ms8364@princeton.edu;~Allen_Z._Ren1;~Anirudha_Majumdar1", "aff": "Princeton University;Princeton University;Princeton University;;Google DeepMind;Princeton University", "aff_domain": "princeton.edu;princeton.edu;princeton.edu;;google.com;princeton.edu", "position": "Postdoc;PhD student;PhD student;;Intern;Associate Professor", "bibtex": "@inproceedings{\ndixit2024perceive,\ntitle={Perceive With Confidence: Statistical Safety Assurances for Navigation with Learning-Based Perception},\nauthor={Anushri Dixit and Zhiting Mei and Meghan Booker and Mariko Storey-Matsutani and Allen Z. Ren and Anirudha Majumdar},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cDXnnOhNrF}\n}", "github": "https://github.com/irom-lab/perception-guarantees", "project": "", "reviewers": "UaeJ;B6rP;a8KQ", "site": "https://openreview.net/forum?id=cDXnnOhNrF", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12276222982479190560&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Princeton University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.princeton.edu;https://deepmind.com", "aff_unique_abbr": "Princeton;DeepMind", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "cGswIOxHcN", "title": "Learning Visual Parkour from Generated Images", "track": "main", "status": "Poster", "tldr": "", "abstract": "Fast and accurate physics simulation is an essential component of robot learning, where robots can explore failure scenarios that are difficult to produce in the real world and learn from unlimited on-policy data. Yet, it remains challenging to incorporate RGB-color perception into the sim-to-real pipeline that matches the real world in its richness and realism. In this work, we train a robot dog in simulation for visual parkour. We propose a way to use generative models to synthesize diverse and physically accurate image sequences of the scene from the robot's ego-centric perspective. 
We present demonstrations of zero-shot transfer to the RGB-only observations of the real world on a robot equipped with a low-cost, off-the-shelf color camera.", "keywords": "Generative AI;Simulation;Legged Locomotion;Sensory Motor-learning", "primary_area": "", "supplementary_material": "/attachment/afec99dcc2e2b282b42906972b9cce6165911844.zip", "author": "Alan Yu;Ge Yang;Ran Choi;Yajvan Ravan;John Leonard;Phillip Isola", "authorids": "~Alan_Yu2;~Ge_Yang1;~Ran_Choi1;~Yajvan_Ravan1;~John_Leonard1;~Phillip_Isola1", "gender": "M;M;F;;M;M", "homepage": "https://alany1.github.io;http://www.episodeyang.com;;https://www.linkedin.com/in/yajvan-ravan/;http://marinerobotics.mit.edu;http://web.mit.edu/phillipi/", "dblp": ";48/4561-3;;;58/1487;36/9988", "google_scholar": "https://scholar.google.com/citations?hl=en;vaQcF6kAAAAJ;;;WPe7vWwAAAAJ;ROILf3EAAAAJ", "orcid": ";0000-0001-7520-7055;;;0000-0002-8863-6550;0000-0002-1411-6704", "linkedin": ";;ran-choi-tmr;yajvan-ravan/;john-leonard-9381393b/;phillip-isola-a9955b20/", "or_profile": "~Alan_Yu2;~Ge_Yang1;~Ran_Choi1;~Yajvan_Ravan1;~John_Leonard1;~Phillip_Isola1", "aff": "Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;mit.edu;mit.edu;mit.edu;mit.edu;mit.edu", "position": "Undergrad student;Postdoc;Postdoc;Undergrad student;Full Professor;Associate Professor", "bibtex": "@inproceedings{\nyu2024learning,\ntitle={Learning Visual Parkour from Generated Images},\nauthor={Alan Yu and Ge Yang and Ran Choi and Yajvan Ravan and John Leonard and Phillip Isola},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cGswIOxHcN}\n}", "github": "", "project": "", "reviewers": "n6VN;di9g;xbt2", "site": "https://openreview.net/forum?id=cGswIOxHcN", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;3", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17001155642040563710&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology", "aff_unique_dep": "", "aff_unique_url": "https://web.mit.edu", "aff_unique_abbr": "MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "cNI0ZkK1yC", "title": "Flow as the Cross-domain Manipulation Interface", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present Im2Flow2Act, a scalable learning framework that enables robots to acquire real-world manipulation skills without the need of real-world robot training data. The key idea behind Im2Flow2Act is to use object flow as the manipulation interface, bridging domain gaps between different embodiments (i.e., human and robot) and training environments (i.e., real-world and simulated). Im2Flow2Act comprises two components: a flow generation network and a flow-conditioned policy. The flow generation network, trained on human demonstration videos, generates object flow from the initial scene image, conditioned on the task description. The flow-conditioned policy, trained on simulated robot play data, maps the generated object flow to robot actions to realize the desired object movements. 
By using flow as input, this policy can be directly deployed in the real world with a minimal sim-to-real gap. By leveraging real-world human videos and simulated robot play data, we bypass the challenges of teleoperating physical robots in the real world, resulting in a scalable system for diverse tasks. We demonstrate Im2Flow2Act's capabilities in a variety of real-world tasks, including the manipulation of rigid, articulated, and deformable objects.", "keywords": "Robots;Learning;cross-domain;cross-embodiment", "primary_area": "", "supplementary_material": "/attachment/43ac432add5b5f104f63de2dd96472ef1bd2c5a1.zip", "author": "Mengda Xu;Zhenjia Xu;Yinghao Xu;Cheng Chi;Gordon Wetzstein;Manuela Veloso;Shuran Song", "authorids": "~Mengda_Xu1;~Zhenjia_Xu1;~Yinghao_Xu1;~Cheng_Chi4;~Gordon_Wetzstein3;~Manuela_Veloso1;~Shuran_Song3", "gender": "M;M;M;M;M;F;F", "homepage": "https://mengdaxu.github.io/;https://www.zhenjiaxu.com/;https://justimyhxu.github.io/;https://cheng-chi.github.io/;http://web.stanford.edu/~gordonwz/;https://www.cs.cmu.edu/~mmv/;https://shurans.github.io/", "dblp": ";238/0000;232/2482;;13/4660;v/ManuelaMVeloso;", "google_scholar": "https://scholar.google.com/citations?hl=en;QE8cLMEAAAAJ;https://scholar.google.com/citations?hl=en;EO0PHdAAAAAJ;VOf45S0AAAAJ;https://scholar.google.com.tw/citations?user=2FbkAzYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;0000-0002-9243-6885;;", "linkedin": "mengda-xu-132b57135/;;;;gordon-wetzstein-2406723/;;", "or_profile": "~Mengda_Xu1;~Zhenjia_Xu1;~Yinghao_Xu1;~Cheng_Chi4;~Gordon_Wetzstein3;~Manuela_Veloso1;~Shuran_Song3", "aff": "Columbia University;Columbia University;Stanford University;Stanford University;Stanford University;School of Computer Science, Carnegie Mellon University;Stanford University", "aff_domain": "columbia.edu;columbia.edu;stanford.edu;stanford.edu;stanford.edu;cs.cmu.edu;stanford.edu", "position": "PhD student;PhD student;Postdoc;PhD student;Associate Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nxu2024flow,\ntitle={Flow as the Cross-domain Manipulation Interface},\nauthor={Mengda Xu and Zhenjia Xu and Yinghao Xu and Cheng Chi and Gordon Wetzstein and Manuela Veloso and Shuran Song},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cNI0ZkK1yC}\n}", "github": "https://github.com/real-stanford/im2Flow2Act", "project": "", "reviewers": "aaJx;GbPT;WWt7", "site": "https://openreview.net/forum?id=cNI0ZkK1yC", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;5;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.5, "gs_citation": 48, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17959278197404251304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1;1;2;1", "aff_unique_norm": "Columbia University;Stanford University;Carnegie Mellon University", "aff_unique_dep": ";;School of Computer Science", "aff_unique_url": "https://www.columbia.edu;https://www.stanford.edu;https://www.cmu.edu", "aff_unique_abbr": "Columbia;Stanford;CMU", "aff_campus_unique_index": "1;1;1;2;1", "aff_campus_unique": ";Stanford;Pittsburgh", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "cT2N3p1AcE", "title": "Visual Whole-Body Control for Legged Loco-Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We study the problem of mobile manipulation 
using legged robots equipped with an arm, namely legged loco-manipulation. The robot legs, while usually utilized for mobility, offer an opportunity to amplify the manipulation capabilities by conducting whole-body control. That is, the robot can control the legs and the arm at the same time to extend its workspace. We propose a framework that can conduct the whole-body control autonomously with visual observations. Our approach, namely Visual Whole-Body Control (VBC), is composed of a low-level policy using all degrees of freedom to track the body velocities along with the end-effector position, and a high-level policy proposing the velocities and end-effector position based on visual inputs. We train both levels of policies in simulation and perform Sim2Real transfer for real robot deployment. We perform extensive experiments and show significant improvements over baselines in picking up diverse objects in different configurations (heights, locations, orientations) and environments.", "keywords": "Robot Learning; Reinforcement Learning; Imitation Learning; Mobile Loco-Manipulation", "primary_area": "", "supplementary_material": "/attachment/3f1a773bc8fb3ba3e1274599403c95f1a17b976e.zip", "author": "Minghuan Liu;Zixuan Chen;Xuxin Cheng;Yandong Ji;Ri-Zhao Qiu;Ruihan Yang;Xiaolong Wang", "authorids": "~Minghuan_Liu1;~Zixuan_Chen9;~Xuxin_Cheng2;~Yandong_Ji1;~Ri-Zhao_Qiu1;~Ruihan_Yang2;~Xiaolong_Wang3", "gender": "M;M;M;;Not Specified;M;M", "homepage": "http://minghuanliu.com;https://zixuan417.github.io;https://chengxuxin.github.io;https://yandongji.github.io;https://rogerqi.github.io/;http://rchalyang.github.io/;https://xiaolonw.github.io/", "dblp": "249/7554;;;271/8584;336/5470;;91/952-4", "google_scholar": ";;Z8vhOxYAAAAJ;LyHzJOMAAAAJ;uH0re54AAAAJ;b-o1o7cAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;rizhaoqiu/;;", "or_profile": "~Minghuan_Liu1;~Zixuan_Chen9;~Xuxin_Cheng2;~Yandong_Ji1;~Ri-Zhao_Qiu1;~Ruihan_Yang2;~Xiaolong_Wang3", "aff": "Shanghai Jiaotong University;Fudan University;University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego;University of California, San Diego", "aff_domain": "sjtu.edu.cn;fudan.edu.cn;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu;ucsd.edu", "position": "PhD student;Undergrad student;PhD student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024visual,\ntitle={Visual Whole-Body Control for Legged Loco-Manipulation},\nauthor={Minghuan Liu and Zixuan Chen and Xuxin Cheng and Yandong Ji and Ri-Zhao Qiu and Ruihan Yang and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cT2N3p1AcE}\n}", "github": "https://github.com/Ericonaldo/visual_wholebody", "project": "", "reviewers": "xf7y;KPpy;FL2x", "site": "https://openreview.net/forum?id=cT2N3p1AcE", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;5;4", "rating_avg": 4.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 47, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6110199937992041226&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;2;2;2;2", "aff_unique_norm": "Shanghai Jiao Tong University;Fudan University;University of California, San Diego", "aff_unique_dep": ";;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.fudan.edu.cn;https://www.ucsd.edu", "aff_unique_abbr": "SJTU;Fudan;UCSD", 
"aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";San Diego", "aff_country_unique_index": "0;0;1;1;1;1;1", "aff_country_unique": "China;United States" }, { "id": "clqzoCrulY", "title": "OrbitGrasp: SE(3)-Equivariant Grasp Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "While grasp detection is an important part of any robotic manipulation pipeline, reliable and accurate grasp detection in $\\\\mathrm{SE}(3)$ remains a research challenge. Many robotics applications in unstructured environments such as the home or warehouse would benefit a lot from better grasp performance. This paper proposes a novel framework for detecting $\\mathrm{SE}(3)$ grasp poses based on point cloud input. Our main contribution is to propose an $\\mathrm{SE}(3)$-equivariant model that maps each point in the cloud to a continuous grasp quality function over the 2-sphere $S^2$ using a spherical harmonic basis. Compared with reasoning about a finite set of samples, this formulation improves the accuracy and efficiency of our model when a large number of samples would otherwise be needed. In order to accomplish this, we propose a novel variation on EquiFormerV2 that leverages a UNet-style backbone to enlarge the number of points the model can handle. Our resulting method, which we name OrbitGrasp, significantly outperforms baselines in both simulation and physical experiments.", "keywords": "Grasp Detection;Equivariance;Symmetry;Grasp Learning", "primary_area": "", "supplementary_material": "/attachment/65890f068643ce22c87937ffedc93be018511d5b.zip", "author": "Boce Hu;Xupeng Zhu;Dian Wang;Zihao Dong;Haojie Huang;Chenghao Wang;Robin Walters;Robert Platt", "authorids": "~Boce_Hu1;~Xupeng_Zhu1;~Dian_Wang1;~Zihao_Dong2;~Haojie_Huang1;~Chenghao_Wang1;~Robin_Walters1;~Robert_Platt1", "gender": "M;M;M;M;M;M;M;", "homepage": "https://bocehu.github.io/;https://zxp-s-works.github.io/;https://pointw.github.io/;;https://haojhuang.github.io/;;http://www.robinwalters.com;http://www.ccs.neu.edu/home/rplatt/", "dblp": ";257/4426;191/1369-1;;144/2195;;258/3416;39/5434", "google_scholar": ";mwxz-8MAAAAJ;CckjtfQAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;fnprJmUAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;;0009-0009-4076-130X;;;;", "linkedin": ";xupengzhu-skunk;dianwang1007;;;https://www.linkedin.com/feed/;;", "or_profile": "~Boce_Hu1;~Xupeng_Zhu1;~Dian_Wang1;~Zihao_Dong2;~Haojie_Huang1;~Chenghao_Wang1;~Robin_Walters1;~Robert_Platt1", "aff": "Northeastern University;Northeastern University;Northeastern University;Northeastern University;Northeastern University;Northeastern University;Northeastern University ;Northeastern University", "aff_domain": "northeastern.edu;northeastern.edu;northeastern.edu;northeastern.edu;northeastern.edu;northeastern.edu;northeastern.edu;neu.edu", "position": "PhD student;PhD student;PhD student;PhD student;PhD student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nhu2024orbitgrasp,\ntitle={OrbitGrasp: {SE}(3)-Equivariant Grasp Learning},\nauthor={Boce Hu and Xupeng Zhu and Dian Wang and Zihao Dong and Haojie Huang and Chenghao Wang and Robin Walters and Robert Platt},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=clqzoCrulY}\n}", "github": "", "project": "", "reviewers": "Zbw2;k5Je;RMP5", "site": "https://openreview.net/forum?id=clqzoCrulY", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, 
"corr_rating_confidence": 0.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11056160926059218189&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0;0;0;0;0;0", "aff_unique_norm": "Northeastern University", "aff_unique_dep": "", "aff_unique_url": "https://www.northeastern.edu", "aff_unique_abbr": "NEU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "cocHfT7CEs", "title": "Generative Image as Action Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Image-generation diffusion models have been fine-tuned to unlock new capabilities such as image-editing and novel view synthesis. Can we similarly unlock image-generation models for visuomotor control? We present GENIMA, a behavior-cloning agent that fine-tunes Stable Diffusion to \u201cdraw joint-actions\u201d as targets on RGB images. These images are fed into a controller that maps the visual targets into a sequence of joint-positions. We study GENIMA on 25 RLBench and 9 real-world manipulation tasks. We find that, by lifting actions into image-space, internet pre-trained diffusion models can generate policies that outperform state- of-the-art visuomotor approaches, especially in robustness to scene perturbations and generalizing to novel objects. Our method is also competitive with 3D agents, despite lacking priors such as depth, keypoints, or motion-planners.", "keywords": "Diffusion Models;Image Generation;Behavior Cloning;Visuomotor", "primary_area": "", "supplementary_material": "/attachment/a60f26cb7651092ca7855076290bfec905a65d85.zip", "author": "Mohit Shridhar;Yat Long Lo;Stephen James", "authorids": "~Mohit_Shridhar1;~Yat_Long_Lo1;~Stephen_James1", "gender": "M;M;M", "homepage": "http://mohitshridhar.com/;https://richielo.github.io;https://stepjam.github.io/", "dblp": "203/8577.html;205/3209;163/5669", "google_scholar": "CrfsfFSiS0kC;qBM2A3kAAAAJ;OXtG-isAAAAJ", "orcid": "0000-0001-7382-763X;;", "linkedin": ";yat-long-richie-lo-922991100/;", "or_profile": "~Mohit_Shridhar1;~Yat_Long_Lo1;~Stephen_James1", "aff": "Dyson;Dyson Robot Learning Lab;Dyson", "aff_domain": "dyson.com;dyson.com;dyson.com", "position": "Researcher;Researcher;Principal Researcher", "bibtex": "@inproceedings{\nshridhar2024generative,\ntitle={Generative Image as Action Models},\nauthor={Mohit Shridhar and Yat Long Lo and Stephen James},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cocHfT7CEs}\n}", "github": "https://github.com/MohitShridhar/genima", "project": "", "reviewers": "oFJa;u2x6;LDdg", "site": "https://openreview.net/forum?id=cocHfT7CEs", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;5;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3676131982637875011&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Dyson", "aff_unique_dep": "", "aff_unique_url": "https://www.dyson.com", "aff_unique_abbr": "", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "cq2uB30uBM", "title": "Pre-emptive Action Revision by Environmental Feedback for Embodied Instruction Following Agents", "track": "main", "status": "Poster", 
"tldr": "", "abstract": "When we, humans, perform a task, we consider changes in environments such as objects' arrangement due to interactions with objects and other reasons; e.g., when we find a mug to clean, if it is already clean, we skip cleaning it. But even the state-of-the-art embodied agents often ignore changed environments when performing a task, leading to failure to complete the task, executing unnecessary actions, or fixing the mistake after it was made. Here, we propose Pre-emptive Action Revision by Environmental feeDback (PRED) that allows an embodied agent to revise their action in response to the perceived environmental status before it makes mistakes. We empirically validate PRED and observe that it outperforms the prior art on two challenging benchmarks in the virtual environment, TEACh and ALFRED, by noticeable margins in most metrics, including unseen success rates, with shorter execution time, implying an efficiently behaved agent. Furthermore, we demonstrate the effectiveness of the proposed method with real robot experiments.", "keywords": "Replanning;Environmental Feedback;Brain plasticity;Embodied AI", "primary_area": "", "supplementary_material": "/attachment/523872a0a61b6d0804815a396f157f02f023b2c8.zip", "author": "Jinyeon Kim;Cheolhong Min;Byeonghwi Kim;Jonghyun Choi", "authorids": "~Jinyeon_Kim1;~Cheolhong_Min1;~Byeonghwi_Kim1;~Jonghyun_Choi1", "gender": "F;M;M;M", "homepage": ";https://cheolhong0916.github.io;https://bhkim94.github.io/;https://ppolon.github.io/", "dblp": ";326/9799;280/2943;21/11103", "google_scholar": "https://scholar.google.co.kr/citations?user=8pkA2wIAAAAJ;;Sr9hbXYAAAAJ;uiGWnm4AAAAJ", "orcid": ";;0000-0003-3775-2778;0000-0002-7934-8434", "linkedin": ";cheolhong-min-7711b5324/;byeonghwi-kim-821909167;jonghyun-choi-459bb615/", "or_profile": "~Jinyeon_Kim1;~Cheolhong_Min1;~Byeonghwi_Kim1;~Jonghyun_Choi1", "aff": "Yonsei University;Yonsei University;Seoul National University;Yonsei University", "aff_domain": "yonsei.ac.kr;yonsei.ac.kr;snu.ac.kr;yonsei.ac.kr", "position": "MS student;MS student;PhD student;Associate Professor", "bibtex": "@inproceedings{\nkim2024preemptive,\ntitle={Pre-emptive Action Revision by Environmental Feedback for Embodied Instruction Following Agents},\nauthor={Jinyeon Kim and Cheolhong Min and Byeonghwi Kim and Jonghyun Choi},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cq2uB30uBM}\n}", "github": "https://github.com/snumprlab/pred", "project": "", "reviewers": "nukf;Wspw;M1LD", "site": "https://openreview.net/forum?id=cq2uB30uBM", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:pVmmXJBWOqYJ:scholar.google.com/&scioq=Pre-emptive+Action+Revision+by+Environmental+Feedback+for+Embodied+Instruction+Following+Agents&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Yonsei University;Seoul National University", "aff_unique_dep": ";", "aff_unique_url": "https://www.yonsei.ac.kr;https://www.snu.ac.kr", "aff_unique_abbr": "Yonsei;SNU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "South Korea" }, { "id": "ctzBccpolr", "title": "RoVi-Aug: Robot and Viewpoint Augmentation for Cross-Embodiment Robot Learning", "track": "main", "status": 
"Poster", "tldr": "", "abstract": "Scaling up robot learning requires large and diverse datasets, and how to efficiently reuse collected data and transfer policies to new embodiments remains an open question. Emerging research such as the Open-X Embodiment (OXE) project has shown promise in leveraging skills by combining datasets including different robots. However, imbalances in the distribution of robot types and camera angles in many datasets make policies prone to overfit. To mitigate this issue, we propose RoVi-Aug, which leverages state-of-the-art image-to-image generative models to augment robot data by synthesizing demonstrations with different robots and camera views. Through extensive physical experiments, we show that, by training on robot- and viewpoint-augmented data, RoVi-Aug can zero-shot deploy on an unseen robot with significantly different camera angles. Compared to test-time adaptation algorithms such as Mirage, RoVi-Aug requires no extra processing at test time, does not assume known camera angles, and allows policy fine-tuning. Moreover, by co-training on both the original and augmented robot datasets, RoVi-Aug can learn multi-robot and multi-task policies, enabling more efficient transfer between robots and skills and improving success rates by up to 30%.", "keywords": "Cross-Embodiment Learning;Viewpoint Robust;Data Augmentation", "primary_area": "", "supplementary_material": "/attachment/1e73036120daaaf7b91b335669d551bca4c1498b.zip", "author": "Lawrence Yunliang Chen;Chenfeng Xu;Karthik Dharmarajan;Richard Cheng;Kurt Keutzer;Masayoshi Tomizuka;Quan Vuong;Ken Goldberg", "authorids": "~Lawrence_Yunliang_Chen1;~Chenfeng_Xu1;~Karthik_Dharmarajan1;~Richard_Cheng1;~Kurt_Keutzer1;~Masayoshi_Tomizuka2;~Quan_Vuong2;~Ken_Goldberg1", "gender": "M;M;;;M;M;M;M", "homepage": "https://yunliangchen.github.io/;;;;https://people.eecs.berkeley.edu/~keutzer/;https://quanvuong.github.io;http://goldberg.berkeley.edu/;https://me.berkeley.edu/people/masayoshi-tomizuka/", "dblp": ";65/1881;;03/5484;k/KurtKeutzer.html;;g/KennethYGoldberg;10/4434", "google_scholar": ";RpqvaTUAAAAJ;;d_Fpj0oAAAAJ;ID9QePIAAAAJ;NSWI3OwAAAAJ;https://scholar.google.com.tw/citations?user=8fztli4AAAAJ;", "orcid": ";0000-0002-4941-6985;;;0000-0003-3868-8501;;0000-0001-6747-9499;", "linkedin": "lawrence-yunliang-chen/;;karthik-dharmarajan/;;kurtkeutzer/;;goldbergken/;", "or_profile": "~Lawrence_Yunliang_Chen1;~Chenfeng_Xu1;~Karthik_Dharmarajan1;~Richard_Cheng1;~Kurt_Keutzer1;~Quan_Vuong2;~Ken_Goldberg1;~Masayoshi_Tomizuka1", "aff": "University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;Toyota Research Institute;University of California, Berkeley;physical intelligence;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;berkeley.edu;eecs.berkeley.edu;tri.global;berkeley.edu;physicalintelligence.company;berkeley.edu;berkeley.edu", "position": "PhD student;PhD student;Undergrad student;Researcher;Full Professor;Researcher;Full Professor;Full Professor", "bibtex": "@inproceedings{\nchen2024roviaug,\ntitle={RoVi-Aug: Robot and Viewpoint Augmentation for Cross-Embodiment Robot Learning},\nauthor={Lawrence Yunliang Chen and Chenfeng Xu and Karthik Dharmarajan and Richard Cheng and Kurt Keutzer and Masayoshi Tomizuka and Quan Vuong and Ken Goldberg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ctzBccpolr}\n}", 
"github": "", "project": "", "reviewers": "scrS;XX5k;HKKU;bj68", "site": "https://openreview.net/forum?id=ctzBccpolr", "pdf_size": 0, "rating": "3;3;4;4", "confidence": "4;4;4;4", "rating_avg": 3.5, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 21, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=516525710168939382&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1;0;2;0;0", "aff_unique_norm": "University of California, Berkeley;Toyota Research Institute;Physical Intelligence", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.tri.global;", "aff_unique_abbr": "UC Berkeley;TRI;", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "cvAIaS6V2I", "title": "OPEN TEACH: A Versatile Teleoperation System for Robotic Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Open-sourced, user-friendly tools form the bedrock of scientific advancement across disciplines. The widespread adoption of data-driven learning has led to remarkable progress in multi-fingered dexterity, bimanual manipulation, and applications ranging from logistics to home robotics. However, existing data collection platforms are often proprietary, costly, or tailored to specific robotic morphologies. We present OPEN TEACH, a new teleoperation system leveraging VR headsets to immerse users in mixed reality for intuitive robot control. built on the affordable Meta Quest 3, which costs $500, OPEN TEACH enables real-time control of various robots, including multi-fingered hands, bimanual arms, and mobile manipulators, through an easy-to-use app. Using natural hand gestures and movements, users can manipulate robots at up to 90Hz with smooth visual feedback and interface widgets offering closeup environment views. We demonstrate the versatility of OPEN TEACH across 38 tasks on different robots. A comprehensive user study indicates significant improvement in teleoperation capability over the AnyTeleop framework. Further experiments exhibit that the collected data is compatible with policy learning on 10 dexterous and contact-rich manipulation tasks. Currently supporting Franka, xArm, Jaco, Allegro, and Hello Stretch platforms, OPEN TEACH is fully open-sourced to promote broader adoption. 
Videos are available at https://anon-open-teach.github.io/.", "keywords": "Teleoperation;Robot Learning;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/4ff5fb81021e3b49e1212edf692b8fa63c5a02b6.zip", "author": "Aadhithya Iyer;Zhuoran Peng;Yinlong Dai;Irmak Guzey;Siddhant Haldar;Soumith Chintala;Lerrel Pinto", "authorids": "~Aadhithya_Iyer1;~Zhuoran_Peng1;~Yinlong_Dai1;~Irmak_Guzey1;~Siddhant_Haldar1;~Soumith_Chintala1;~Lerrel_Pinto1", "gender": "M;M;M;F;M;;M", "homepage": "https://aadhithya14.github.io/;https://bobbypeng123.github.io/;;https://irmakguzey.github.io/;https://siddhanthaldar.github.io/;;https://www.lerrelpinto.com/", "dblp": ";;;;227/2282;http://dblp.uni-trier.de/pers/hd/c/Chintala:Soumith;168/8304", "google_scholar": ";O7sI_yoAAAAJ;;0FEl834AAAAJ;-h_bkRgAAAAJ;36ofBJgAAAAJ;pmVPj94AAAAJ", "orcid": ";;;;;;", "linkedin": ";;yinlong-dai-28aa35168/;;;;", "or_profile": "~Aadhithya_Iyer1;~Zhuoran_Peng1;~Yinlong_Dai1;~Irmak_Guzey1;~Siddhant_Haldar1;~Soumith_Chintala1;~Lerrel_Pinto1", "aff": "New York University;New York University;New York University;New York University;New York University;Meta Facebook;New York University", "aff_domain": "nyu.edu;nyu.edu;nyu.edu;nyu.edu;nyu.edu;fb.com;cs.nyu.edu", "position": "MS student;Undergrad student;Undergrad student;PhD student;PhD student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\niyer2024open,\ntitle={{OPEN} {TEACH}: A Versatile Teleoperation System for Robotic Manipulation},\nauthor={Aadhithya Iyer and Zhuoran Peng and Yinlong Dai and Irmak Guzey and Siddhant Haldar and Soumith Chintala and Lerrel Pinto},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cvAIaS6V2I}\n}", "github": "https://github.com/aadhithya14/Open-Teach", "project": "", "reviewers": "m2vB;BA4K;rYe8", "site": "https://openreview.net/forum?id=cvAIaS6V2I", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 56, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8678221066156945519&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;1;0", "aff_unique_norm": "New York University;Meta", "aff_unique_dep": ";Meta Platforms, Inc.", "aff_unique_url": "https://www.nyu.edu;https://meta.com", "aff_unique_abbr": "NYU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "cvUXoou8iz", "title": "SPIRE: Synergistic Planning, Imitation, and Reinforcement Learning for Long-Horizon Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robot learning has proven to be a general and effective technique for programming manipulators. Imitation learning is able to teach robots solely from human demonstrations but is bottlenecked by the capabilities of the demonstrations. Reinforcement learning uses exploration to discover better behaviors; however, the space of possible improvements can be too large to start from scratch. And for both techniques, the learning difficulty increases proportional to the length of the manipulation task. 
Accounting for this, we propose SPIRE, a system that first uses Task and Motion Planning (TAMP) to decompose tasks into smaller learning subproblems and second combines imitation and reinforcement learning to maximize their strengths. We develop novel strategies to train learning agents when deployed in the context of a planning system. We evaluate SPIRE on a suite of long-horizon and contact-rich robot manipulation problems. We find that SPIRE outperforms prior approaches that integrate imitation learning, reinforcement learning, and planning by 35% to 50% in average task performance, is 6 times more data efficient in the number of human demonstrations needed to train proficient agents, and learns to complete tasks nearly twice as efficiently. View https://sites.google.com/view/spire-corl-2024 for more details.", "keywords": "Reinforcement Learning;Manipulation Planning;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/aa4f183e3aef8b55057e81a5c3296dc7adef91c3.zip", "author": "Zihan Zhou;Animesh Garg;Dieter Fox;Caelan Reed Garrett;Ajay Mandlekar", "authorids": "~Zihan_Zhou1;~Animesh_Garg1;~Dieter_Fox1;~Caelan_Reed_Garrett1;~Ajay_Mandlekar1", "gender": "M;M;M;M;M", "homepage": ";http://animesh.garg.tech;https://homes.cs.washington.edu/~fox/;http://web.mit.edu/caelan/www/;https://ai.stanford.edu/~amandlek/", "dblp": "00/6525-2;123/5728;f/DieterFox;161/9727;https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay", "google_scholar": ";zp8V7ZMAAAAJ;DqXsbPAAAAAJ;KVUCqGwAAAAJ;MEz23joAAAAJ", "orcid": ";0000-0003-0482-4296;;0000-0002-6474-1276;", "linkedin": ";animeshgarg/;;caelan-garrett-85197977/;", "or_profile": "~Zihan_Zhou1;~Animesh_Garg1;~Dieter_Fox1;~Caelan_Reed_Garrett1;~Ajay_Mandlekar1", "aff": "Department of Computer Science, University of Toronto;NVIDIA;Department of Computer Science;NVIDIA;NVIDIA", "aff_domain": "cs.toronto.edu;nvidia.com;cs.washington.edu;nvidia.com;nvidia.com", "position": "PhD student;Researcher;Full Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nzhou2024spire,\ntitle={{SPIRE}: Synergistic Planning, Imitation, and Reinforcement Learning for Long-Horizon Manipulation},\nauthor={Zihan Zhou and Animesh Garg and Dieter Fox and Caelan Reed Garrett and Ajay Mandlekar},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cvUXoou8iz}\n}", "github": "", "project": "", "reviewers": "doj6;bC4r;TuDy", "site": "https://openreview.net/forum?id=cvUXoou8iz", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.5, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12387376568984115870&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1;1", "aff_unique_norm": "University of Toronto;NVIDIA;Unknown Institution", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.utoronto.ca;https://www.nvidia.com;", "aff_unique_abbr": "U of T;NVIDIA;", "aff_campus_unique_index": "0", "aff_campus_unique": "Toronto;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "Canada;United States;" }, { "id": "cvVEkS5yij", "title": "Meta-Control: Automatic Model-based Control Synthesis for Heterogeneous Robot Skills", "track": "main", "status": "Poster", "tldr": "", "abstract": "The requirements for real-world manipulation tasks are diverse and often 
conflicting; some tasks require precise motion while others require force compliance; some tasks require avoidance of certain regions while others require convergence to certain states. Satisfying these varied requirements with a fixed state-action representation and control strategy is challenging, impeding the development of a universal robotic foundation model. In this work, we propose Meta-Control, the first LLM-enabled automatic control synthesis approach that creates customized state representations and control strategies tailored to specific tasks. Our core insight is that a meta-control system can be built to automate the thought process that human experts use to design control systems. Specifically, human experts heavily use a model-based, hierarchical (from abstract to concrete) thought model, then compose various dynamic models and controllers together to form a control system. Meta-Control mimics the thought model and harnesses LLM's extensive control knowledge with Socrates' \"art of midwifery\" to automate the thought process. Meta-Control stands out for its fully model-based nature, allowing rigorous analysis, generalizability, robustness, efficient parameter tuning, and reliable real-time execution.", "keywords": "embodied agent;model-based control;LLM;manipulation", "primary_area": "", "supplementary_material": "", "author": "Tianhao Wei;Liqian Ma;Rui Chen;Weiye Zhao;Changliu Liu", "authorids": "~Tianhao_Wei1;mlq19@mails.tsinghua.edu.cn;~Rui_Chen11;~Weiye_Zhao1;~Changliu_Liu1", "gender": "M;;M;M;F", "homepage": ";;https://ruichen.pub/;https://github.com/CaesarAndylaw;http://www.cs.cmu.edu/~cliu6/index.html", "dblp": "222/5386;;;228/6863;166/3563", "google_scholar": "V22j1C0AAAAJ;;XiUE0wMAAAAJ;P-79KOcAAAAJ;", "orcid": ";;0000-0002-8671-8771;0000-0002-8426-5238;", "linkedin": ";;;;", "or_profile": "~Tianhao_Wei1;mlq19@mails.tsinghua.edu.cn;~Rui_Chen11;~Weiye_Zhao1;~Changliu_Liu1", "aff": "Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;;andrew.cmu.edu;andrew.cmu.edu;cmu.edu", "position": "PhD student;;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nwei2024metacontrol,\ntitle={Meta-Control: Automatic Model-based Control Synthesis for Heterogeneous Robot Skills},\nauthor={Tianhao Wei and Liqian Ma and Rui Chen and Weiye Zhao and Changliu Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=cvVEkS5yij}\n}", "github": "", "project": "", "reviewers": "U2Js;5Tm1;9Tq5", "site": "https://openreview.net/forum?id=cvVEkS5yij", "pdf_size": 0, "rating": "3;3;3", "confidence": "2;4;2", "rating_avg": 3.0, "confidence_avg": 2.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12947304437140974910&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "dUo6j3YURS", "title": "MOSAIC: Modular Foundation Models for Assistive and Interactive Cooking", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present MOSAIC, a modular architecture for coordinating multiple robots to (a) interact 
with users using natural language and (b) manipulate an open vocabulary of everyday objects. At several levels, MOSAIC employs modularity: it leverages multiple large-scale pre-trained models for high-level tasks like language and image recognition, while using streamlined modules designed for low-level task-specific control. This decomposition allows us to reap the complementary benefits of foundation models and precise, more specialized models, enabling our system to scale to complex tasks that involve coordinating multiple robots and humans. First, we unit-test individual modules with 180 episodes of visuomotor picking, 60 episodes of human motion forecasting, and 46 online user evaluations of the task planner. We then extensively evaluate MOSAIC with 60 end-to-end trials. We discuss crucial design decisions, limitations of the current system, and open challenges in this domain", "keywords": "Foundation Models;Human-Robot Interaction;Model Learning", "primary_area": "", "supplementary_material": "/attachment/8bf0c8480cff9ddc5516008d09bc9b271efb4b0d.zip", "author": "Huaxiaoyue Wang;Kushal Kedia;Juntao Ren;Rahma Abdullah;Atiksh Bhardwaj;Angela Chao;Kelly Y Chen;Nathaniel Chin;Prithwish Dan;Xinyi Fan;Gonzalo Gonzalez-Pumariega;Aditya Kompella;Maximus Adrian Pace;Yash Sharma;Xiangwan Sun;Neha Sunkara;Sanjiban Choudhury", "authorids": "~Huaxiaoyue_Wang1;~Kushal_Kedia1;~Juntao_Ren1;~Rahma_Abdullah1;~Atiksh_Bhardwaj1;~Angela_Chao1;~Kelly_Y_Chen1;~Nathaniel_Chin1;~Prithwish_Dan1;~Xinyi_Fan3;~Gonzalo_Gonzalez-Pumariega1;~Aditya_Kompella1;~Maximus_Adrian_Pace1;~Yash_Sharma3;~Xiangwan_Sun1;~Neha_Sunkara1;~Sanjiban_Choudhury3", "gender": "F;M;M;;M;F;F;M;M;F;M;M;M;M;Not Specified;F;M", "homepage": "https://lunay0yuki.github.io/;https://kushal2000.github.io/;https://jren03.github.io/;;;https://github.com/angelac345;;;https://portfolio-pdan101.vercel.app/;;https://gonzalogonzalezpumariega.com/;;https://maxpace1.github.io;https://yash-s20.github.io/;;https://ns597.github.io/cv;https://www.sanjibanchoudhury.com/", "dblp": "324/6120;;340/8425;;;;;;;;;;;121/9967-4;;;135/8207", "google_scholar": "yweLdycAAAAJ;;https://scholar.google.com/citations?hl=en;;;;;;;;72zQVF8AAAAJ;;;hWLTV6AAAAAJ;j8c6XJgAAAAJ;;https://scholar.google.com/citations?hl=en", "orcid": "0009-0008-9239-1745;;;;;;;;;;0009-0004-5425-7319;;;0009-0009-7428-795X;;;", "linkedin": "yukiwang-hw;;juntaoren/;rahma-abdullah/;atiksh-bhardwaj-b080ab241/;angela-chao-b347b217a;kelly-chen-0417/;nathaniel-chin-5b2301195/;prithwish-dan/;xinyi-vivian-fan/;gonzalogonzalez2000/;aditya-kompella-3aab221b4;maximuspace/;yash-sharma-b2bb04157/;sunny-sun25/;neha-sunkara-34269222b/;", "or_profile": "~Huaxiaoyue_Wang1;~Kushal_Kedia1;~Juntao_Ren1;~Rahma_Abdullah1;~Atiksh_Bhardwaj1;~Angela_Chao1;~Kelly_Y_Chen1;~Nathaniel_Chin1;~Prithwish_Dan1;~Xinyi_Fan3;~Gonzalo_Gonzalez-Pumariega1;~Aditya_Kompella1;~Maximus_Adrian_Pace1;~Yash_Sharma3;~Xiangwan_Sun1;~Neha_Sunkara1;~Sanjiban_Choudhury3", "aff": "Cornell University;Cornell University;Department of Computer Science, Cornell University;Cornell University;Cornell University;Cornell University;Cornell University;Cornell University;Department of Computer Science, Cornell University;Cornell University;Cornell University;Department of Computer Science, Cornell University;Cornell University;Cornell University;Cornell University;Cornell University;Cornell University", "aff_domain": 
"cornell.edu;cornell.edu;cs.cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu;cs.cornell.edu;cornell.edu;cs.cornell.edu;cs.cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu;cornell.edu", "position": "PhD student;PhD student;Undergrad student;Undergrad student;Undergrad student;Undergrad student;Undergrad student;MS student;Undergrad student;Undergrad student;MS student;MS student;Undergrad student;MS student;Undergrad student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nwang2024mosaic,\ntitle={{MOSAIC}: Modular Foundation Models for Assistive and Interactive Cooking},\nauthor={Huaxiaoyue Wang and Kushal Kedia and Juntao Ren and Rahma Abdullah and Atiksh Bhardwaj and Angela Chao and Kelly Y Chen and Nathaniel Chin and Prithwish Dan and Xinyi Fan and Gonzalo Gonzalez-Pumariega and Aditya Kompella and Maximus Adrian Pace and Yash Sharma and Xiangwan Sun and Neha Sunkara and Sanjiban Choudhury},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dUo6j3YURS}\n}", "github": "https://github.com/portal-cornell/MOSAIC/", "project": "", "reviewers": "ibvm;34B7;MKTe", "site": "https://openreview.net/forum?id=dUo6j3YURS", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 17, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:PaZJyEf53ukJ:scholar.google.com/&scioq=MOSAIC:+Modular+Foundation+Models+for+Assistive+and+Interactive+Cooking&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_unique_norm": "Cornell University", "aff_unique_dep": "", "aff_unique_url": "https://www.cornell.edu", "aff_unique_abbr": "Cornell", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "dXSGw7Cy55", "title": "Contrast Sets for Evaluating Language-Guided Robot Policies", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robot evaluations in language-guided, real world settings are time-consuming and often sample only a small space of potential instructions across complex scenes. In this work, we introduce contrast sets for robotics as an approach to make small, but specific, perturbations to otherwise independent, identically distributed (i.i.d.) test instances. We investigate the relationship between experimenter effort to carry out an evaluation and the resulting estimated test performance as well as the insights that can be drawn from performance on perturbed instances. We use contrast sets to characterize policies at reduced experimenter effort in both a simulated manipulation task and a physical robot vision-and-language navigation task. We encourage the use of contrast set evaluations as a more informative alternative to small scale, i.i.d. 
demonstrations on physical robots, and as a scalable alternative to industry-scale real world evaluations.", "keywords": "Evaluation;Language-guided robots", "primary_area": "", "supplementary_material": "/attachment/7297b8bc75f5b1459b09916b97dadf6d36b50972.zip", "author": "Abrar Anwar;Rohan Gupta;Jesse Thomason", "authorids": "~Abrar_Anwar1;~Rohan_Gupta3;~Jesse_Thomason1", "gender": "M;M;M", "homepage": "http://abraranwar.github.io/;;https://jessethomason.com/", "dblp": "294/1347.html;;130/2863", "google_scholar": "c6E-5tcAAAAJ;;8BeTDr0AAAAJ", "orcid": "0000-0003-4442-4369;;0000-0001-9199-0633", "linkedin": "abraranwar;rohan---gupta;jesse-thomason-034746171/", "or_profile": "~Abrar_Anwar1;~Rohan_Gupta3;~Jesse_Thomason1", "aff": "University of Southern California;University of Southern California;Amazon", "aff_domain": "usc.edu;usc.edu;amazon.com", "position": "PhD student;Undergrad student;Visiting Academic", "bibtex": "@inproceedings{\nanwar2024contrast,\ntitle={Contrast Sets for Evaluating Language-Guided Robot Policies},\nauthor={Abrar Anwar and Rohan Gupta and Jesse Thomason},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dXSGw7Cy55}\n}", "github": "", "project": "", "reviewers": "DoyM;uVPW;5QZG", "site": "https://openreview.net/forum?id=dXSGw7Cy55", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=944394569805909815&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of Southern California;Amazon", "aff_unique_dep": ";Amazon.com, Inc.", "aff_unique_url": "https://www.usc.edu;https://www.amazon.com", "aff_unique_abbr": "USC;Amazon", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Los Angeles;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "deywgeWmL5", "title": "TLDR: Unsupervised Goal-Conditioned RL via Temporal Distance-Aware Representations", "track": "main", "status": "Poster", "tldr": "", "abstract": "Unsupervised goal-conditioned reinforcement learning (GCRL) is a promising paradigm for developing diverse robotic skills without external supervision. However, existing unsupervised GCRL methods often struggle to cover a wide range of states in complex environments due to their limited exploration and sparse or noisy rewards for GCRL. To overcome these challenges, we propose a novel unsupervised GCRL method that leverages TemporaL Distance-aware Representations (TLDR). Based on temporal distance, TLDR selects faraway goals to initiate exploration and computes intrinsic exploration rewards and goal-reaching rewards. Specifically, our exploration policy seeks states with large temporal distances (i.e. covering a large state space), while the goal-conditioned policy learns to minimize the temporal distance to the goal (i.e. reaching the goal). 
Our results in six simulated locomotion environments demonstrate that TLDR significantly outperforms prior unsupervised GCRL methods in achieving a wide range of states.", "keywords": "Unsupervised Goal-Conditioned Reinforcement Learning;Temporal Distance-Aware Representations", "primary_area": "", "supplementary_material": "/attachment/a7c0a81a9dad166e605dd8046602746c210cf526.zip", "author": "Junik Bae;Kwanyoung Park;Youngwoon Lee", "authorids": "~Junik_Bae1;~Kwanyoung_Park1;~Youngwoon_Lee1", "gender": ";M;M", "homepage": "https://github.com/heatz123;;https://youngwoon.github.io", "dblp": ";284/1034;117/4767", "google_scholar": ";odFC9mAAAAAJ;CDPa3AgAAAAJ", "orcid": ";;0000-0001-9918-1056", "linkedin": ";;", "or_profile": "~Junik_Bae1;~Kwanyoung_Park1;~Youngwoon_Lee1", "aff": "Seoul National University;Seoul National University;University of California, Berkeley", "aff_domain": "snu.ac.kr;snu.ac.kr;berkeley.edu", "position": "Undergrad student;Undergrad student;Postdoc", "bibtex": "@inproceedings{\nbae2024tldr,\ntitle={{TLDR}: Unsupervised Goal-Conditioned {RL} via Temporal Distance-Aware Representations},\nauthor={Junik Bae and Kwanyoung Park and Youngwoon Lee},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=deywgeWmL5}\n}", "github": "https://github.com/heatz123/tldr", "project": "", "reviewers": "7NjC;nYH6;uLZM", "site": "https://openreview.net/forum?id=deywgeWmL5", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;3;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4115033728200583818&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1", "aff_unique_norm": "Seoul National University;University of California, Berkeley", "aff_unique_dep": ";", "aff_unique_url": "https://www.snu.ac.kr;https://www.berkeley.edu", "aff_unique_abbr": "SNU;UC Berkeley", "aff_campus_unique_index": "1", "aff_campus_unique": ";Berkeley", "aff_country_unique_index": "0;0;1", "aff_country_unique": "South Korea;United States" }, { "id": "dsxmR6lYlg", "title": "Reinforcement Learning with Foundation Priors: Let Embodied Agent Efficiently Learn on Its Own", "track": "main", "status": "Poster", "tldr": "", "abstract": "Reinforcement learning (RL) is a promising approach for solving robotic manipulation tasks.\nHowever, it is challenging to apply the RL algorithms directly in the real world.\nFor one thing, RL is data-intensive and typically requires millions of interactions with environments, which are impractical in real scenarios. \nFor another, it is necessary to make heavy engineering efforts to design reward functions manually. \nTo address these issues, we leverage foundation models in this paper. \nWe propose Reinforcement Learning with Foundation Priors (RLFP) to utilize guidance and feedback from policy, value, and success-reward foundation models.\nWithin this framework, we introduce the Foundation-guided Actor-Critic (FAC) algorithm, which enables embodied agents to explore more efficiently with automatic reward functions.\nThe benefits of our framework are threefold: (1) \\textit{sample efficient}; (2) \\textit{minimal and effective reward engineering}; (3) \\textit{agnostic to foundation model forms and robust to noisy priors}. 
Our method achieves remarkable performances in various manipulation tasks on both real robots and in simulation. Across 5 dexterous tasks with real robots, FAC achieves an average success rate of 86\\% after one hour of real-time learning. \nAcross 8 tasks in the simulated Meta-world, FAC achieves 100\\% success rates in 7/8 tasks under less than 100k frames (about 1-hour training), outperforming baseline methods with manual-designed rewards in 1M frames. \nWe believe the RLFP framework can enable future robots to explore and learn autonomously in the physical world for more tasks.", "keywords": "Reinforcement Learning;Foundation Models;Robotics;VLMs", "primary_area": "", "supplementary_material": "/attachment/d8fe88f6b7c288490c28705d7ab48fa23ec00eff.zip", "author": "Weirui Ye;Yunsheng Zhang;Haoyang Weng;Xianfan Gu;Shengjie Wang;Tong Zhang;Mengchen Wang;Pieter Abbeel;Yang Gao", "authorids": "~Weirui_Ye1;~Yunsheng_Zhang1;~Haoyang_Weng1;~Xianfan_Gu1;~Shengjie_Wang2;~Tong_Zhang23;~Mengchen_Wang3;~Pieter_Abbeel2;~Yang_Gao1", "gender": "M;M;M;M;;M;M;M;M", "homepage": "https://yewr.github.io/;https://isa233.github.io/;;https://shengjiewang-jason.github.io/;https://tongzhangthu.github.io/;https://github.com/fjlafafa;https://people.eecs.berkeley.edu/~pabbeel/;http://yang-gao.weebly.com;https://egalahad.github.io/", "dblp": "245/3595;;344/1664;;;;;89/4402-29;", "google_scholar": "_GgST9AAAAAJ;;;;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.tw/citations?user=vtwH6GkAAAAJ;https://scholar.google.com/citations?hl=en;", "orcid": ";;0000-0002-7860-9460;;;;;;", "linkedin": ";;;;;;;yang-gao-45245348/;", "or_profile": "~Weirui_Ye1;~Yunsheng_Zhang1;~Xianfan_Gu1;~Shengjie_Wang2;~Tong_Zhang23;~Mengchen_Wang3;~Pieter_Abbeel2;~Yang_Gao1;~Weng_Haoyang1", "aff": "Tsinghua University;;Shanghai Qi Zhi Institute;Tsinghua University;Tsinghua University;Tsinghua University;Covariant;Tsinghua University;Tsinghua University", "aff_domain": "tsinghua.edu.cn;;sqz.ac.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;mail.tsinghua.edu.cn;covariant.ai;tsinghua.edu.cn;tsinghua.edu.cn", "position": "PhD student;;Researcher;PhD student;PhD student;Undergrad student;Founder;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nye2024reinforcement,\ntitle={Reinforcement Learning with Foundation Priors: Let Embodied Agent Efficiently Learn on Its Own},\nauthor={Weirui Ye and Yunsheng Zhang and Haoyang Weng and Xianfan Gu and Shengjie Wang and Tong Zhang and Mengchen Wang and Pieter Abbeel and Yang Gao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=dsxmR6lYlg}\n}", "github": "https://github.com/YeWR/RLFP", "project": "", "reviewers": "bb5a;kNo7;M4P9", "site": "https://openreview.net/forum?id=dsxmR6lYlg", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;5", "rating_avg": 3.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13202366981818240003&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0;2;0;0", "aff_unique_norm": "Tsinghua University;Shanghai Qi Zhi Institute;Covariant", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tsinghua.edu.cn;https://www.qz.io;", "aff_unique_abbr": "THU;;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China;" }, { "id": 
"eJHy0AF5TO", "title": "RiEMann: Near Real-Time SE(3)-Equivariant Robot Manipulation without Point Cloud Segmentation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present RiEMann, an end-to-end near Real-time SE(3)-Equivariant Robot Manipulation imitation learning framework from scene point cloud input. Compared to previous methods that rely on descriptor field matching, RiEMann directly predicts the target actions for manipulation without any object segmentation. RiEMann can efficiently train the visuomotor policy from scratch with 5 to 10 demonstrations for a manipulation task, generalizes to unseen SE(3) transformations and instances of target objects, resists visual interference of distracting objects, and follows the near real-time pose change of the target object. The scalable SE(3)-equivariant action space of RiEMann supports both pick-and-place tasks and articulated object manipulation tasks. In simulation and real-world 6-DOF robot manipulation experiments, we test RiEMann on 5 categories of manipulation tasks with a total of 25 variants and show that RiEMann outperforms baselines in both task success rates and SE(3) geodesic distance errors (reduced by 68.6%), and achieves 5.4 frames per second (fps) network inference speed.", "keywords": "SE(3)-Equivariance;Manipulation;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/ba3c6f63a45ee79506abf72f9244f4c3d38a0a19.zip", "author": "Chongkai Gao;Zhengrong Xue;Shuying Deng;Tianhai Liang;Siqi Yang;Lin Shao;Huazhe Xu", "authorids": "~Chongkai_Gao1;~Zhengrong_Xue1;~Shuying_Deng1;~Tianhai_Liang1;~Siqi_Yang4;~Lin_Shao2;~Huazhe_Xu1", "gender": "M;;F;M;M;M;M", "homepage": "https://chongkaigao.com/;https://steven-xzr.github.io;https://shuyingdeng.github.io/;https://github.com/Tinhal;http://www.google.com;https://linsats.github.io/;http://hxu.rocks", "dblp": "295/8658;272/4241;;;;26/8546-2;164/9006", "google_scholar": "l_mOqY8AAAAJ;LO3pKmwAAAAJ;;;;https://scholar.google.com/citations?hl=en;t9HPFawAAAAJ", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Chongkai_Gao1;~Zhengrong_Xue1;~Shuying_Deng1;~Tianhai_Liang1;~Siqi_Yang4;~Lin_Shao2;~Huazhe_Xu1", "aff": "National University of Singapore;Tsinghua University;Tsinghua University;Harbin Institute of Technology, Shenzhen;Tsinghua University;National University of Singapore;Tsinghua University", "aff_domain": "nus.edu.sg;tsinghua.edu.cn;mails.tsinghua.edu.cn;hit.edu.cn;mails.tsinghua.edu.cn;nus.edu.sg;tsinghua.edu.cn", "position": "PhD student;PhD student;Undergrad student;Undergrad student;Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\ngao2024riemann,\ntitle={Ri{EM}ann: Near Real-Time {SE}(3)-Equivariant Robot Manipulation without Point Cloud Segmentation},\nauthor={Chongkai Gao and Zhengrong Xue and Shuying Deng and Tianhai Liang and Siqi Yang and Lin Shao and Huazhe Xu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eJHy0AF5TO}\n}", "github": "https://github.com/HeegerGao/RiEMann", "project": "", "reviewers": "9VAx;UjoM;HXLL", "site": "https://openreview.net/forum?id=eJHy0AF5TO", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16538373089696135198&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 
5, "aff_unique_index": "0;1;1;2;1;0;1", "aff_unique_norm": "National University of Singapore;Tsinghua University;Harbin Institute of Technology", "aff_unique_dep": ";;", "aff_unique_url": "https://www.nus.edu.sg;https://www.tsinghua.edu.cn;http://en.hhit.edu.cn/", "aff_unique_abbr": "NUS;THU;HIT", "aff_campus_unique_index": "1", "aff_campus_unique": ";Shenzhen", "aff_country_unique_index": "0;1;1;1;1;0;1", "aff_country_unique": "Singapore;China" }, { "id": "eTRncsYYdv", "title": "Solving Offline Reinforcement Learning with Decision Tree Regression", "track": "main", "status": "Poster", "tldr": "", "abstract": "This study presents a novel approach to addressing offline reinforcement learning (RL) problems by reframing them as regression tasks that can be effectively solved using Decision Trees. Mainly, we introduce two distinct frameworks: return-conditioned and return-weighted decision tree policies (RCDTP and RWDTP), both of which achieve notable speed in agent training as well as inference, with training typically lasting less than a few minutes. Despite the simplification inherent in this reformulated approach to offline RL, our agents demonstrate performance that is at least on par with the established methods. We evaluate our methods on D4RL datasets for locomotion and manipulation, as well as other robotic tasks involving wheeled and flying robots. Additionally, we assess performance in delayed/sparse reward scenarios and highlight the explainability of these policies through action distribution and feature importance.", "keywords": "Offline Reinforcement Learning;Decision Trees", "primary_area": "", "supplementary_material": "", "author": "Prajwal Koirala;Cody Fleming", "authorids": "~Prajwal_Koirala1;~Cody_Fleming2", "gender": "M;", "homepage": ";http://coordinatedsystemslab.com", "dblp": ";122/4774", "google_scholar": ";", "orcid": ";", "linkedin": "prajwal-koirala-a332a4184/;", "or_profile": "~Prajwal_Koirala1;~Cody_Fleming2", "aff": "Iowa State University;Iowa State University", "aff_domain": "iastate.edu;iastate.edu", "position": "MS student;Associate Professor", "bibtex": "@inproceedings{\nkoirala2024solving,\ntitle={Solving Offline Reinforcement Learning with Decision Tree Regression},\nauthor={Prajwal Koirala and Cody Fleming},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eTRncsYYdv}\n}", "github": "https://github.com/PrajwalKoirala/Offline-Reinforcement-Learning-with-Decision-Tree-Regression/tree/main", "project": "", "reviewers": "c4Yz;NHhZ;i7UK", "site": "https://openreview.net/forum?id=eTRncsYYdv", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;2;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": 0.5, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2638206652966394699&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Iowa State University", "aff_unique_dep": "", "aff_unique_url": "https://www.iastate.edu", "aff_unique_abbr": "ISU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "eU5E0oTtpS", "title": "Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large Language Models (LLM) have emerged as a tool for robots to generate task plans using common 
sense reasoning. For the LLM to generate actionable plans, scene context must be provided, often through a map. Recent works have shifted from explicit maps with fixed semantic classes to implicit open vocabulary maps based on queryable embeddings capable of representing any semantic class. However, embeddings cannot directly report the scene context as they are implicit, requiring further processing for LLM integration. To address this, we propose an explicit text-based map that can represent thousands of semantic classes while easily integrating with LLMs due to their text-based nature by building upon large-scale image recognition models. We study how entities in our map can be localized and show through evaluations that our text-based map localizations perform comparably to those from open vocabulary maps while using two to four orders of magnitude less memory. Real-robot experiments demonstrate the grounding of an LLM with the text-based map to solve user tasks.", "keywords": "Scene Understanding;Large Language Models", "primary_area": "", "supplementary_material": "/attachment/2f75a3e2174832a0ee231d4b6a6c35ca8982b753.zip", "author": "Mike Zhang;Kaixian Qu;Vaishakh Patil;Cesar Cadena;Marco Hutter", "authorids": "~Mike_Zhang2;~Kaixian_Qu1;~Vaishakh_Patil1;~Cesar_Cadena1;~Marco_Hutter1", "gender": ";M;M;;M", "homepage": "https://clams-casino.github.io/;https://mavt.ethz.ch/people/person-detail.MjY1MTQ3.TGlzdC81NTksLTE3MDY5NzgwMTc=.html;;https://n.ethz.ch/~cesarc/;http://www.rsl.ethz.ch", "dblp": ";;255/5070;43/7748;04/2753", "google_scholar": ";9TgpzJQAAAAJ;aB04078AAAAJ;aOns5HQAAAAJ;https://scholar.google.ch/citations?user=DO3quJYAAAAJ", "orcid": ";;;0000-0002-2972-6011;0000-0002-4285-4990", "linkedin": ";;;;", "or_profile": "~Mike_Zhang2;~Kaixian_Qu1;~Vaishakh_Patil1;~Cesar_Cadena1;~Marco_Hutter1", "aff": "ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETHZ - ETH Zurich;ETH Zurich;ETHZ - ETH Zurich", "aff_domain": "ethz.ch;ethz.ch;ethz.ch;ethz.ch;ethz.ch", "position": "PhD student;PhD student;Postdoc;Senior Scientist;Associate Professor", "bibtex": "@inproceedings{\nzhang2024tag,\ntitle={Tag Map: A Text-Based Map for Spatial Reasoning and Navigation with Large Language Models},\nauthor={Mike Zhang and Kaixian Qu and Vaishakh Patil and Cesar Cadena and Marco Hutter},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eU5E0oTtpS}\n}", "github": "", "project": "", "reviewers": "99N9;Bsyi;W8CK;4KgS", "site": "https://openreview.net/forum?id=eU5E0oTtpS", "pdf_size": 0, "rating": "3;3;3;3", "confidence": "5;3;3;3", "rating_avg": 3.0, "confidence_avg": 3.5, "replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17268856250353138554&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "ETH Zurich", "aff_unique_dep": "", "aff_unique_url": "https://www.ethz.ch", "aff_unique_abbr": "ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Switzerland" }, { "id": "edP2dmingV", "title": "Large Scale Mapping of Indoor Magnetic Field by Local and Sparse Gaussian Processes", "track": "main", "status": "Poster", "tldr": "", "abstract": "Magnetometer-based indoor navigation uses variations in the magnetic field to determine the robot's location. 
For that, a magnetic map of the environment has to be built beforehand from a collection of localized magnetic measurements. Existing solutions built on sparse Gaussian Process (GP) regression do not scale well to large environments, being either slow or resulting in discontinuous prediction. In this paper, we propose to model the magnetic field of large environments based on GP regression. We first modify a deterministic training conditional sparse GP by accounting for magnetic field physics to map small environments efficiently. We then scale the model on larger scenes by introducing a local expert aggregation framework. It splits the scene into subdomains, fits a local expert on each, and then aggregates expert predictions in a differentiable and probabilistic way. We evaluate our model on real and simulated data and show that we can smoothly map a three-story building in a few hundred milliseconds.", "keywords": "Gaussian process regression;magnetic field maps;indoor localization", "primary_area": "", "supplementary_material": "/attachment/5b675296e73d73b231aae0c4bdc56f9ccada4aca.zip", "author": "Iad ABDUL-RAOUF;Vincent Gay-Bellile;Cyril JOLY;Steve Bourgeois;Alexis Paljic", "authorids": "~Iad_ABDUL-RAOUF1;~Vincent_Gay-Bellile1;~Cyril_JOLY1;~Steve_Bourgeois1;alexis.paljic@mines-paristech.fr", "gender": "M;M;M;M;", "homepage": ";;;;", "dblp": ";42/2276;47/8924;34/4622;", "google_scholar": "FjFtHjcAAAAJ;kUVG8pIAAAAJ;p0R5vbsAAAAJ;;", "orcid": ";;0000-0002-2899-0179;;", "linkedin": "iad-abdul-raouf-702059177/;;;;", "or_profile": "~Iad_ABDUL-RAOUF1;~Vincent_Gay-Bellile1;~Cyril_JOLY1;~Steve_Bourgeois1;alexis.paljic@mines-paristech.fr", "aff": "Mines ParisTech;CEA;Mines ParisTech;CEA;", "aff_domain": "minesparis.psl.eu;cea.fr;mines-paristech.fr;cea.fr;", "position": "PhD student;Researcher;Associate Professor;Researcher;", "bibtex": "@inproceedings{\nabdul-raouf2024large,\ntitle={Large Scale Mapping of Indoor Magnetic Field by Local and Sparse Gaussian Processes},\nauthor={Iad ABDUL-RAOUF and Vincent Gay-Bellile and Cyril JOLY and Steve Bourgeois and Alexis Paljic},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=edP2dmingV}\n}", "github": "", "project": "", "reviewers": "XHZx;MA22;KGGy", "site": "https://openreview.net/forum?id=edP2dmingV", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:cFXQa3-SAHIJ:scholar.google.com/&scioq=Large+Scale+Mapping+of+Indoor+Magnetic+Field+by+Local+and+Sparse+Gaussian+Processes&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;1;0;1", "aff_unique_norm": "MINES ParisTech;Commissariat \u00e0 l'\u00c9nergie Atomique et aux \u00c9nergies Alternatives", "aff_unique_dep": ";", "aff_unique_url": "https://www.mines-paristech.fr;https://www.cea.fr", "aff_unique_abbr": "Mines ParisTech;CEA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "France" }, { "id": "eeoX7tCoK2", "title": "Shelf-Supervised Cross-Modal Pre-Training for 3D Object Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "State-of-the-art 3D object detectors are often trained on massive labeled datasets. 
However, annotating 3D bounding boxes remains prohibitively expensive and time-consuming, particularly for LiDAR. Instead, recent works demonstrate that self-supervised pre-training with unlabeled data can improve detection accuracy with limited labels. Contemporary methods adapt best-practices for self-supervised learning from the image domain to point clouds (such as contrastive learning). However, publicly available 3D datasets are considerably smaller and less diverse than those used for image-based self-supervised learning, limiting their effectiveness. We do note, however, that such data is naturally collected in a multimodal fashion, often paired with images. Rather than pre-training with only self-supervised objectives, we argue that it is better to bootstrap point cloud representations using image-based foundation models trained on internet-scale image data. Specifically, we propose a shelf-supervised approach (e.g. supervised with off-the-shelf image foundation models) for generating zero-shot 3D bounding boxes from paired RGB and LiDAR data. Pre-training 3D detectors with such pseudo-labels yields significantly better semi-supervised detection accuracy than prior self-supervised pretext tasks. Importantly, we show that image-based shelf-supervision is helpful for training LiDAR-only and multi-modal (RGB + LiDAR) detectors. We demonstrate the effectiveness of our approach on nuScenes and WOD, significantly improving over prior work in limited data settings.", "keywords": "Shelf-Supervised 3D Object Detection;Vision-Language Models;Autonomous Vehicles", "primary_area": "", "supplementary_material": "/attachment/051ca3f45fdfec1075196256f2320406acec64cc.zip", "author": "Mehar Khurana;Neehar Peri;James Hays;Deva Ramanan", "authorids": "mehar21541@iiitd.ac.in;~Neehar_Peri1;~James_Hays1;~Deva_Ramanan1", "gender": ";M;M;M", "homepage": ";http://neeharperi.com;http://www.cc.gatech.edu/~hays/;https://www.cs.cmu.edu/~deva/", "dblp": ";241/5094;57/5958;49/488", "google_scholar": ";X3cGY7wAAAAJ;vjZrDKQAAAAJ;9B8PoXUAAAAJ", "orcid": ";;0000-0001-7016-4252;", "linkedin": ";neeharperi/;james-h-hays/;", "or_profile": "mehar21541@iiitd.ac.in;~Neehar_Peri1;~James_Hays1;~Deva_Ramanan1", "aff": ";Carnegie Mellon University;Georgia Institute of Technology;School of Computer Science, Carnegie Mellon University", "aff_domain": ";cmu.edu;gatech.edu;cs.cmu.edu", "position": ";PhD student;Associate professor;Full Professor", "bibtex": "@inproceedings{\nkhurana2024shelfsupervised,\ntitle={Shelf-Supervised Cross-Modal Pre-Training for 3D Object Detection},\nauthor={Mehar Khurana and Neehar Peri and James Hays and Deva Ramanan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=eeoX7tCoK2}\n}", "github": "https://github.com/meharkhurana03/cm3d", "project": "", "reviewers": "rkuQ;XAhw;yS2F;gB95", "site": "https://openreview.net/forum?id=eeoX7tCoK2", "pdf_size": 0, "rating": "1;1;2;3", "confidence": "5;4;4;3", "rating_avg": 1.75, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": -0.8528028654224417, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Cmq2EpqPadoJ:scholar.google.com/&scioq=Shelf-Supervised+Cross-Modal+Pre-Training+for+3D+Object+Detection&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;0", "aff_unique_norm": "Carnegie Mellon University;Georgia Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.cmu.edu;https://www.gatech.edu", 
"aff_unique_abbr": "CMU;Georgia Tech", "aff_campus_unique_index": "1", "aff_campus_unique": ";Pittsburgh", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "evCXwlCMIi", "title": "Learning to Walk from Three Minutes of Real-World Data with Semi-structured Dynamics Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Traditionally, model-based reinforcement learning (MBRL) methods exploit neural networks as flexible function approximators to represent $\\textit{a priori}$ unknown environment dynamics. However, training data are typically scarce in practice, and these black-box models often fail to generalize. Modeling architectures that leverage known physics can substantially reduce the complexity of system-identification, but break down in the face of complex phenomena such as contact. We introduce a novel framework for learning semi-structured dynamics models for contact-rich systems which seamlessly integrates structured first principles modeling techniques with black-box auto-regressive models. Specifically, we develop an ensemble of probabilistic models to estimate external forces, conditioned on historical observations and actions, and integrate these predictions using known Lagrangian dynamics. With this semi-structured approach, we can make accurate long-horizon predictions with substantially less data than prior methods. We leverage this capability and propose Semi-Structured Reinforcement Learning ($\\texttt{SSRL}$) a simple model-based learning framework which pushes the sample complexity boundary for real-world learning. We validate our approach on a real-world Unitree Go1 quadruped robot, learning dynamic gaits -- from scratch -- on both hard and soft surfaces with just a few minutes of real-world data. 
Video and code are available at: https://sites.google.com/utexas.edu/ssrl", "keywords": "Model-Based Reinforcement Learning;Physics-Based Models", "primary_area": "", "supplementary_material": "/attachment/2f995359edb81aae5a7d9ee211e0070b483fa4c8.zip", "author": "Jacob Levy;Tyler Westenbroek;David Fridovich-Keil", "authorids": "~Jacob_Levy1;~Tyler_Westenbroek1;~David_Fridovich-Keil1", "gender": "M;M;M", "homepage": ";https://scholar.google.com/citations?user=aqSKwDQAAAAJ&hl=en;https://dfridovi.github.io", "dblp": ";;203/5260", "google_scholar": "LLmcf-oAAAAJ;;gqyTnpQAAAAJ", "orcid": ";;", "linkedin": "jacob-levy-13b85069/;;", "or_profile": "~Jacob_Levy1;~Tyler_Westenbroek1;~David_Fridovich-Keil1", "aff": "University of Texas at Austin;;University of Texas at Austin", "aff_domain": "utexas.edu;;utexas.edu", "position": "PhD student;;Assistant Professor", "bibtex": "@inproceedings{\nlevy2024learning,\ntitle={Learning to Walk from Three Minutes of Real-World Data with Semi-structured Dynamics Models},\nauthor={Jacob Levy and Tyler Westenbroek and David Fridovich-Keil},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=evCXwlCMIi}\n}", "github": "https://github.com/CLeARoboticsLab/ssrl", "project": "", "reviewers": "bENK;V9T9;reP2", "site": "https://openreview.net/forum?id=evCXwlCMIi", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=690003902181232721&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "University of Texas at Austin", "aff_unique_dep": "", "aff_unique_url": "https://www.utexas.edu", "aff_unique_abbr": "UT Austin", "aff_campus_unique_index": "0;0", "aff_campus_unique": "Austin", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "fC0wWeXsVm", "title": "Learning Robot Soccer from Egocentric Vision with Deep Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We apply multi-agent deep reinforcement learning (RL) to train end-to-end robot soccer policies with fully onboard computation and sensing via egocentric RGB vision. This setting reflects many challenges of real-world robotics, including active perception, agile full-body control, and long-horizon planning in a dynamic, partially-observable, multi-agent domain. We rely on large-scale, simulation-based data generation to obtain complex behaviors from egocentric vision which can be successfully transferred to physical robots using low-cost sensors.\nTo achieve adequate visual realism, our simulation combines rigid-body physics with learned, realistic rendering via multiple Neural Radiance Fields (NeRFs). We combine teacher-based multi-agent RL and cross-experiment data reuse to enable the discovery of sophisticated soccer strategies. We analyze active-perception behaviors including object tracking and ball seeking that emerge when simply optimizing perception-agnostic soccer play. The agents display equivalent levels of performance and agility as policies with access to privileged, ground-truth state. 
To our knowledge, this paper constitutes a first demonstration of end-to-end training for multi-agent robot soccer, mapping raw pixel observations to joint-level actions that can be deployed in the real world.", "keywords": "robotics;deep reinforcement learning", "primary_area": "", "supplementary_material": "/attachment/2e99ba7e5712f61a4994b88bfdfdbb76a4561c8e.zip", "author": "Dhruva Tirumala;Markus Wulfmeier;Ben Moran;Sandy Huang;Jan Humplik;Guy Lever;Tuomas Haarnoja;Leonard Hasenclever;Arunkumar Byravan;Nathan Batchelor;Neil sreendra;Kushal Patel;Marlon Gwira;Francesco Nori;Martin Riedmiller;Nicolas Heess", "authorids": "~Dhruva_Tirumala1;~Markus_Wulfmeier1;~Ben_Moran2;~Sandy_Huang1;~Jan_Humplik1;~Guy_Lever1;~Tuomas_Haarnoja1;~Leonard_Hasenclever1;~Arunkumar_Byravan1;~Nathan_Batchelor1;~Neil_sreendra1;~Kushal_Patel1;~Marlon_Gwira1;~Francesco_Nori2;~Martin_Riedmiller1;~Nicolas_Heess1", "gender": "M;M;F;M;M;M;M;M;M;M;M;M;M;M;;", "homepage": ";;https://shhuang.github.io/;;;;;https://homes.cs.washington.edu/~barun/;;;;;;https://www.riedmiller.me/;;", "dblp": "166/1552;;153/7841;215/9213;52/6149;80/9963;150/1667;151/9400;;;;;21/3290;;76/9181;190/7697.html", "google_scholar": ";;eurA6WgAAAAJ;YE9w2BsAAAAJ;;VT7peyEAAAAJ;https://scholar.google.co.uk/citations?user=dD-3S4QAAAAJ;obYwWiMAAAAJ;;;;;AqlbAj8AAAAJ;1gVfqpcAAAAJ;79k7bGEAAAAJ;HqKq-2YAAAAJ", "orcid": ";0000-0002-9254-662X;;;;;;;;;;;0000-0003-3763-6873;;;", "linkedin": ";;;;;tuomas-haarnoja;;;nathan-batchelor-a62456164?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;neil-sreendra-778b63b6;kushal-patel-66591a148/;marlon-gwira-a2bb1310a/;;;;", "or_profile": "~Markus_Wulfmeier1;~Ben_Moran2;~Sandy_Huang1;~Jan_Humplik1;~Guy_Lever1;~Tuomas_Haarnoja1;~Leonard_Hasenclever1;~Arunkumar_Byravan1;~Nathan_Batchelor1;~Neil_sreendra1;~Kushal_Patel1;~Marlon_Gwira1;~Francesco_Nori2;~Martin_Riedmiller1;~Nicolas_Heess1;~Dhruva_TB1", "aff": "Google DeepMind;Google DeepMind;Google DeepMind;Google DeepMind;;Google DeepMind;Google DeepMind;Google;Google;;;;Google DeepMind;;Google DeepMind;University College London", "aff_domain": "deepmind.com;deepmind.com;deepmind.com;google.com;;deepmind.com;google.com;google.com;deepmind.com;;;;deepmind.com;;google.com;ucl.ac.uk", "position": "Research Scientist;Researcher;Research Scientist;Research scientist;;Research Scientist;Research Scientist;Research Scientist;Instructor;;;;Research Scientist;;Research Scientist;PhD student", "bibtex": "@inproceedings{\ntirumala2024learning,\ntitle={Learning Robot Soccer from Egocentric Vision with Deep Reinforcement Learning},\nauthor={Dhruva Tirumala and Markus Wulfmeier and Ben Moran and Sandy Huang and Jan Humplik and Guy Lever and Tuomas Haarnoja and Leonard Hasenclever and Arunkumar Byravan and Nathan Batchelor and Neil sreendra and Kushal Patel and Marlon Gwira and Francesco Nori and Martin Riedmiller and Nicolas Heess},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fC0wWeXsVm}\n}", "github": "", "project": "", "reviewers": "4VaL;PVvE;arj7", "site": "https://openreview.net/forum?id=fC0wWeXsVm", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 16, "corr_rating_confidence": 1.0, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13041990536285361350&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0;0;0;0;0;1", 
"aff_unique_norm": "Google;University College London", "aff_unique_dep": "Google DeepMind;", "aff_unique_url": "https://deepmind.com;https://www.ucl.ac.uk", "aff_unique_abbr": "DeepMind;UCL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;0;0;0;0;0;1;1;0;0;0", "aff_country_unique": "United Kingdom;United States" }, { "id": "fCDOfpTCzZ", "title": "InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment", "track": "main", "status": "Poster", "tldr": "", "abstract": "Enabling robots to navigate following diverse language instructions in unexplored environments is an attractive goal for human-robot interaction. However, this goal is challenging because different navigation tasks require different strategies. The scarcity of instruction navigation data hinders training an instruction navigation model with varied strategies. Therefore, previous methods are all constrained to one specific type of navigation instruction. In this work, we propose InstructNav, a generic instruction navigation system. InstructNav makes the first endeavor to handle various instruction navigation tasks without any navigation training or pre-built maps. To reach this goal, we introduce Dynamic Chain-of-Navigation (DCoN) to unify the planning process for different types of navigation instructions. Furthermore, we propose Multi-sourced Value Maps to model key elements in instruction navigation so that linguistic DCoN planning can be converted into robot actionable trajectories. With InstructNav, we complete the R2R-CE task in a zero-shot way for the first time and outperform many task-training methods. Besides, InstructNav also surpasses the previous SOTA method by 10.48% on the zero-shot Habitat ObjNav and by 86.34% on demand-driven navigation DDN. 
Real robot experiments on diverse indoor scenes further demonstrate our method's robustness in coping with the environment and instruction variations.", "keywords": "Generic Instruction Navigation;Zero-shot;Unexplored Environment", "primary_area": "", "supplementary_material": "/attachment/3633c15f8475c2c4dfcf9e23cf0f833c64c49a0d.zip", "author": "Yuxing Long;Wenzhe Cai;Hongcheng Wang;Guanqi Zhan;Hao Dong", "authorids": "~Yuxing_Long1;~Wenzhe_Cai1;~Hongcheng_Wang6;~Guanqi_Zhan1;~Hao_Dong3", "gender": "M;M;M;;M", "homepage": "https://lyx0501.github.io/;https://wzcai99.github.io/;;https://www.robots.ox.ac.uk/~guanqi/;https://zsdonghao.github.io", "dblp": "337/1595;261/2706;76/1170;254/2030;14/1525-3.html", "google_scholar": "UqQ41BIAAAAJ;NHQcCyAAAAAJ;;f_m4WJIAAAAJ;xLFL4sMAAAAJ", "orcid": ";;;;0000-0003-2261-9122", "linkedin": ";;%E9%B8%BF%E9%93%96-%E7%8E%8B-4a66451b9/;;", "or_profile": "~Yuxing_Long1;~Wenzhe_Cai1;~Hongcheng_Wang6;~Guanqi_Zhan1;~Hao_Dong3", "aff": "Beijing University of Posts and Telecommunications;Southeast University;Peking University;University of Oxford;Peking University", "aff_domain": "bupt.edu.cn;seu.edu.cn;pku.edu.cn;ox.ac.uk;pku.edu.cn", "position": "MS student;PhD student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nlong2024instructnav,\ntitle={InstructNav: Zero-shot System for Generic Instruction Navigation in Unexplored Environment},\nauthor={Yuxing Long and Wenzhe Cai and Hongcheng Wang and Guanqi Zhan and Hao Dong},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fCDOfpTCzZ}\n}", "github": "https://github.com/LYX0501/InstructNav", "project": "", "reviewers": "nMyn;dRHf;3AfH", "site": "https://openreview.net/forum?id=fCDOfpTCzZ", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 34, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7459998912543260083&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Beijing University of Posts and Telecommunications;Southeast University;Peking University;University of Oxford", "aff_unique_dep": ";;;", "aff_unique_url": "http://www.bupt.edu.cn/;https://www.seu.edu.cn/;http://www.pku.edu.cn;https://www.ox.ac.uk", "aff_unique_abbr": "BUPT;SEU;Peking U;Oxford", "aff_campus_unique_index": "0", "aff_campus_unique": "Beijing;", "aff_country_unique_index": "0;0;0;1;0", "aff_country_unique": "China;United Kingdom" }, { "id": "fDRO4NHEwZ", "title": "VIRL: Self-Supervised Visual Graph Inverse Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning dense reward functions from unlabeled videos for reinforcement learning exhibits scalability due to the vast diversity and quantity of video resources. Recent works use visual features or graph abstractions in videos to measure task progress as rewards, which either deteriorate in unseen domains or capture spatial information while overlooking visual details. We propose $\\textbf{V}$isual-Graph $\\textbf{I}$nverse $\\textbf{R}$einforcement $\\textbf{L}$earning (VIRL), a self-supervised method that synergizes low-level visual features and high-level graph abstractions from frames to graph representations for reward learning. 
VIRL utilizes a visual encoder that extracts object-wise features for graph nodes and a graph encoder that derives properties from graphs constructed from detected objects in each frame. The encoded representations are enforced to align videos temporally and reconstruct in-scene objects. The pretrained visual graph encoder is then utilized to construct a dense reward function for policy learning by measuring latent distances between current frames and the goal frame. Our empirical evaluation on the X-MAGICAL and Robot Visual Pusher benchmark demonstrates that VIRL effectively handles tasks necessitating both granular visual attention and broader global feature consideration, and exhibits robust generalization to $\\textit{extrapolation}$ tasks and domains not seen in demonstrations. Our policy for the robotic task also achieves the highest success rate in real-world robot experiments.", "keywords": "Inverse Reinforcement Learning;Learning from Video;Graph Network", "primary_area": "", "supplementary_material": "/attachment/e0514d57cc9862e7756e4cb7ad68ac5a19a74922.zip", "author": "Lei Huang;Weijia Cai;Zihan Zhu;Chen Feng;Helge Rhodin;Zhengbo Zou", "authorids": "~Lei_Huang11;~Weijia_Cai1;zzhu12@student.ubc.ca;~Chen_Feng2;~Helge_Rhodin5;~Zhengbo_Zou1", "gender": "M;;;M;;", "homepage": "https://leihhhuang.github.io/;;;https://ai4ce.github.io/;;https://zzzzzbbzzzzz.github.io/zhengbozou.github.io/", "dblp": ";;;01/161-2;;", "google_scholar": "qM_o0AkAAAAJ;AqGGowsAAAAJ;;YeG8ZM0AAAAJ;;-p51P1kAAAAJ", "orcid": ";;;0000-0003-3211-1576;;", "linkedin": "lei-huang11/;;;simbaforrest/;;", "or_profile": "~Lei_Huang11;~Weijia_Cai1;zzhu12@student.ubc.ca;~Chen_Feng2;~Helge_Rhodin5;~Zhengbo_Zou1", "aff": "University of British Columbia;University of British Columbia;;New York University;;Columbia University", "aff_domain": "ubc.ca;ubc.ca;;nyu.edu;;columbia.edu", "position": "Research assistant;PhD student;;Assistant Professor;;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024virl,\ntitle={{VIRL}: Self-Supervised Visual Graph Inverse Reinforcement Learning},\nauthor={Lei Huang and Weijia Cai and Zihan Zhu and Chen Feng and Helge Rhodin and Zhengbo Zou},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fDRO4NHEwZ}\n}", "github": "", "project": "", "reviewers": "SzPu;M2rM;4TrT;avVr", "site": "https://openreview.net/forum?id=fDRO4NHEwZ", "pdf_size": 0, "rating": "3;3;3;3", "confidence": "4;4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:-omJrS6bfZYJ:scholar.google.com/&scioq=VIRL:+Self-Supervised+Visual+Graph+Inverse+Reinforcement+Learning&hl=en&as_sdt=0,33", "gs_version_total": 0, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of British Columbia;New York University;Columbia University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.ubc.ca;https://www.nyu.edu;https://www.columbia.edu", "aff_unique_abbr": "UBC;NYU;Columbia", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;1", "aff_country_unique": "Canada;United States" }, { "id": "fIj88Tn3fc", "title": "ReMix: Optimizing Data Mixtures for Large Scale Imitation Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Increasingly large robotics datasets are being collected to train larger foundation models in robotics. 
However, despite the fact that data selection has been of utmost importance to scaling in vision and natural language processing (NLP), little work in robotics has questioned what data such models should actually be trained on. In this work we investigate how to weigh different subsets or ``domains'' of robotics datasets during pre-training to maximize worst-case performance across all possible downstream domains using distributionally robust optimization (DRO). Unlike in NLP, we find that these methods are hard to apply out of the box due to varying action spaces and dynamics across robots. Our method, ReMix, employs early stopping and action normalization and discretization to counteract these issues. Through extensive experimentation on both the Bridge and OpenX datasets, we demonstrate that data curation can have an outsized impact on downstream performance. Specifically, domain weights learned by ReMix outperform uniform weights by over 40\\% on average and human-selected weights by over 20\\% on datasets used to train the RT-X models.", "keywords": "Data Curation;Data Quality;Robot Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/37287fc82d9dd25944e1776d48d8a5752f0da03a.zip", "author": "Joey Hejna;Chethan Anand Bhateja;Yichen Jiang;Karl Pertsch;Dorsa Sadigh", "authorids": "~Joey_Hejna1;~Chethan_Anand_Bhateja1;ycjiang@stanford.edu;~Karl_Pertsch1;~Dorsa_Sadigh1", "gender": ";M;;;F", "homepage": ";;;https://kpertsch.github.io/;https://dorsa.fyi/", "dblp": ";;;211/7137;117/3174", "google_scholar": ";;;https://scholar.google.com/citations?view_op=list_works;ZaJEZpYAAAAJ", "orcid": ";;;;", "linkedin": ";chetbhateja;;;", "or_profile": "~Joey_Hejna1;~Chethan_Anand_Bhateja1;ycjiang@stanford.edu;~Karl_Pertsch1;~Dorsa_Sadigh1", "aff": ";University of California, Berkeley;;Stanford University;Stanford University", "aff_domain": ";berkeley.edu;;stanford.edu;stanford.edu", "position": ";Undergrad student;;Postdoc;Assistant Professor", "bibtex": "@inproceedings{\nhejna2024remix,\ntitle={ReMix: Optimizing Data Mixtures for Large Scale Imitation Learning},\nauthor={Joey Hejna and Chethan Anand Bhateja and Yichen Jiang and Karl Pertsch and Dorsa Sadigh},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fIj88Tn3fc}\n}", "github": "https://github.com/jhejna/remix", "project": "", "reviewers": "zvQ8;Q6JL;aygU", "site": "https://openreview.net/forum?id=fIj88Tn3fc", "pdf_size": 0, "rating": "1;4;4", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14479051956536140759&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "University of California, Berkeley;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.stanford.edu", "aff_unique_abbr": "UC Berkeley;Stanford", "aff_campus_unique_index": "0;1;1", "aff_campus_unique": "Berkeley;Stanford", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "fNBbEgcfwO", "title": "Surgical Robot Transformer (SRT): Imitation Learning for Surgical Tasks", "track": "main", "status": "Poster", "tldr": "", "abstract": "We explore whether surgical manipulation tasks can be learned on the da Vinci robot via imitation learning.\nHowever, the da Vinci system presents unique challenges 
which hinder straight-forward implementation of imitation learning. Notably, its forward kinematics is inconsistent due to imprecise joint measurements, and naively training a policy using such approximate kinematics data often leads to task failure. To overcome this limitation, we introduce a relative action formulation \nwhich enables successful policy training and deployment using its approximate kinematics data. A promising outcome of this approach is that the large repository of clinical data, which contains approximate kinematics, may be directly utilized for robot learning without further corrections. We demonstrate our findings through successful execution of three fundamental surgical tasks, including tissue manipulation, needle handling, and knot-tying.", "keywords": "Imitation Learning;Manipulation;Medical Robotics", "primary_area": "", "supplementary_material": "/attachment/216db7e885d750343169c5d03cf2e45c1d118b2a.zip", "author": "Ji Woong Kim;Tony Z. Zhao;Samuel Schmidgall;Anton Deguet;Marin Kobilarov;Chelsea Finn;Axel Krieger", "authorids": "~Ji_Woong_Kim2;~Tony_Z._Zhao1;~Samuel_Schmidgall1;anton.deguet@jhu.edu;~Marin_Kobilarov1;~Chelsea_Finn1;~Axel_Krieger1", "gender": "M;;M;;M;F;", "homepage": "https://jkim447.github.io/;https://tonyzhaozh.github.io/;https://github.com/SamuelSchmidgall;;http://asco.lcsr.jhu.edu;https://ai.stanford.edu/~cbfinn/;", "dblp": ";;267/2231;;;131/1783;", "google_scholar": ";;https://scholar.google.com/citations?hl=en;;;vfPE6hgAAAAJ;ezJ0eHAAAAAJ", "orcid": ";;;;;;0000-0001-8169-075X", "linkedin": ";;samuel-schmidgall-288632162/;;;;", "or_profile": "~Ji_Woong_Kim2;~Tony_Z._Zhao1;~Samuel_Schmidgall1;anton.deguet@jhu.edu;~Marin_Kobilarov1;~Chelsea_Finn1;~Axel_Krieger1", "aff": ";Stanford University;Advanced Micro Devices;;Johns Hopkins University;Google;Johns Hopkins University", "aff_domain": ";stanford.edu;amd.com;;jhu.edu;google.com;jhu.edu", "position": ";PhD student;Research Intern;;Associate Professor;Research Scientist;Associate Professor", "bibtex": "@inproceedings{\nkim2024surgical,\ntitle={Surgical Robot Transformer ({SRT}): Imitation Learning for Surgical Tasks},\nauthor={Ji Woong Kim and Tony Z. 
Zhao and Samuel Schmidgall and Anton Deguet and Marin Kobilarov and Chelsea Finn and Axel Krieger},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fNBbEgcfwO}\n}", "github": "", "project": "", "reviewers": "q2qZ;kUC3;XUen", "site": "https://openreview.net/forum?id=fNBbEgcfwO", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;4;3", "rating_avg": 3.6666666666666665, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 27, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=1383512987325253425&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;2", "aff_unique_norm": "Stanford University;Advanced Micro Devices, Inc.;Johns Hopkins University;Google", "aff_unique_dep": ";;;Google", "aff_unique_url": "https://www.stanford.edu;https://www.amd.com;https://www.jhu.edu;https://www.google.com", "aff_unique_abbr": "Stanford;AMD;JHU;Google", "aff_campus_unique_index": "0;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "fR1rCXjCQX", "title": "Learning Compositional Behaviors from Demonstration and Language", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce Behavior from Language and Demonstration (BLADE), a framework for long-horizon robotic manipulation by integrating imitation learning and model-based planning. BLADE leverages language-annotated demonstrations, extracts abstract action knowledge from large language models (LLMs), and constructs a library of structured, high-level action representations. These representations include preconditions and effects grounded in visual perception for each high-level action, along with corresponding controllers implemented as neural network-based policies. BLADE can recover such structured representations automatically, without manually labeled states or symbolic definitions. BLADE shows significant capabilities in generalizing to novel situations, including novel initial states, external state perturbations, and novel goals. 
We validate the effectiveness of our approach both in simulation and on real robots with a diverse set of objects with articulated parts, partial observability, and geometric constraints.", "keywords": "Manipulation;Planning Abstractions;Learning from Language", "primary_area": "", "supplementary_material": "/attachment/18369eb8c1d0317551593f4fc6d6dca9c91afb81.zip", "author": "Weiyu Liu;Neil Nie;Ruohan Zhang;Jiayuan Mao;Jiajun Wu", "authorids": "~Weiyu_Liu1;~Neil_Nie1;~Ruohan_Zhang1;~Jiayuan_Mao1;~Jiajun_Wu1", "gender": "M;M;M;F;M", "homepage": "http://weiyuliu.com/;https://www.neilnie.com;https://ai.stanford.edu/~zharu/;http://jiayuanm.com;https://jiajunwu.com", "dblp": "133/0311.html;;;200/8283;117/4768", "google_scholar": "PHi0YEQAAAAJ;-xDeBSYAAAAJ;-bqvNWoAAAAJ;-xaOIZIAAAAJ;2efgcS0AAAAJ", "orcid": ";;;0000-0003-4798-3748;0000-0002-4176-343X", "linkedin": ";;;;jiajunwu/", "or_profile": "~Weiyu_Liu1;~Neil_Nie1;~Ruohan_Zhang1;~Jiayuan_Mao1;~Jiajun_Wu1", "aff": "Stanford University;Stanford University;Stanford University;Massachusetts Institute of Technology;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;mit.edu;stanford.edu", "position": "Postdoc;MS student;Postdoc;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nliu2024learning,\ntitle={Learning Compositional Behaviors from Demonstration and Language},\nauthor={Weiyu Liu and Neil Nie and Ruohan Zhang and Jiayuan Mao and Jiajun Wu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fR1rCXjCQX}\n}", "github": "", "project": "", "reviewers": "cYEP;UF9m;ZoiQ", "site": "https://openreview.net/forum?id=fR1rCXjCQX", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6336552573135259143&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0;1;0", "aff_unique_norm": "Stanford University;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://web.mit.edu", "aff_unique_abbr": "Stanford;MIT", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "fs7ia3FqUM", "title": "Humanoid Parkour Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Parkour is a grand challenge for legged locomotion, even for quadruped robots, requiring active perception and various maneuvers to overcome multiple challenging obstacles. Existing methods for humanoid locomotion either optimize a trajectory for a single parkour track or train a reinforcement learning policy only to walk with a significant amount of motion references. In this work, we propose a framework for learning an end-to-end vision-based whole-body-control parkour policy for humanoid robots that overcomes multiple parkour skills without any motion prior. Using the parkour policy, the humanoid robot can jump on a 0.42m platform, leap over hurdles, 0.8m gaps, and much more. It can also run at 1.8m/s in the wild and walk robustly on different terrains. We test our policy in indoor and outdoor environments to demonstrate that it can autonomously select parkour skills while following the rotation command of the joystick. 
We override the arm actions and show that this framework can easily transfer to humanoid mobile manipulation tasks. Videos can be found at https://humanoid4parkour.github.io", "keywords": "Humanoid Agile Locomotion;Visuomotor Control;Sim-to-Real Transfer", "primary_area": "", "supplementary_material": "/attachment/a902a0c633c765e158749f1b988a4421be268674.zip", "author": "Ziwen Zhuang;Shenzhe Yao;Hang Zhao", "authorids": "~Ziwen_Zhuang1;~Shenzhe_Yao1;~Hang_Zhao1", "gender": "M;M;M", "homepage": "https://ziwenzhuang.github.io;http://leopoldyao.github.io;http://www.mit.edu/~hangzhao/", "dblp": ";;", "google_scholar": "GE8fpdwAAAAJ;;DmahiOYAAAAJ", "orcid": ";;", "linkedin": "leozhuang;;", "or_profile": "~Ziwen_Zhuang1;~Shenzhe_Yao1;~Hang_Zhao1", "aff": "ShanghaiTech University;ShanghaiTech University;Tsinghua University", "aff_domain": "shanghaitech.edu.cn;shanghaitech.edu.cn;tsinghua.edu.cn", "position": "MS student;Undergrad student;Assistant Professor", "bibtex": "@inproceedings{\nzhuang2024humanoid,\ntitle={Humanoid Parkour Learning},\nauthor={Ziwen Zhuang and Shenzhe Yao and Hang Zhao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=fs7ia3FqUM}\n}", "github": "", "project": "", "reviewers": "XFVA;CnFz;sc8E", "site": "https://openreview.net/forum?id=fs7ia3FqUM", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 4, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5512585831749675287&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1", "aff_unique_norm": "ShanghaiTech University;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "https://www.shanghaitech.edu.cn;https://www.tsinghua.edu.cn", "aff_unique_abbr": "ShanghaiTech;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "gqCQxObVz2", "title": "3D Diffuser Actor: Policy Diffusion with 3D Scene Representations", "track": "main", "status": "Poster", "tldr": "", "abstract": "Diffusion policies are conditional diffusion models that learn robot action distributions conditioned on the robot and environment state. They have recently shown to outperform both deterministic and alternative action distribution learning formulations. 3D robot policies use 3D scene feature representations aggregated from a single or multiple camera views using sensed depth. They have shown to generalize better than their 2D counterparts across camera viewpoints. We unify these two lines of work and present 3D Diffuser Actor , a neural policy equipped with a novel 3D denoising transformer that fuses information from the 3D visual scene, a language instruction and proprioception to predict the noise in noised 3D robot pose trajectories. 3D Diffuser Actor sets a new state-of-the-art on RLBench with an absolute performance gain of 18.1% over the current SOTA on a multi-view setup and an absolute gain of 13.1% on a single-view setup. On the CALVIN benchmark, it improves over the current SOTA by a 9% relative increase. It also learns to control a robot manipulator in the real world from a handful of demonstrations. 
Through thorough comparisons with the current SOTA policies and ablations of our model, we show 3D Diffuser Actor \u2019s design choices dramatically outperform 2D representations, regression and classification objectives, absolute attentions, and holistic non-tokenized 3D scene embeddings.", "keywords": "Diffusion models;3D representations;manipulation;imitation learning", "primary_area": "", "supplementary_material": "", "author": "Tsung-Wei Ke;Nikolaos Gkanatsios;Katerina Fragkiadaki", "authorids": "~Tsung-Wei_Ke2;~Nikolaos_Gkanatsios1;~Katerina_Fragkiadaki1", "gender": ";M;F", "homepage": "https://twke18.github.io/;https://nickgkan.github.io/;https://www.cs.cmu.edu/~katef/", "dblp": "173/4984;225/5677;21/8780", "google_scholar": "WTEFsHMAAAAJ;https://scholar.google.gr/citations?user=jk7GqOEAAAAJ;FWp7728AAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Tsung-Wei_Ke2;~Nikolaos_Gkanatsios1;~Katerina_Fragkiadaki1", "aff": "Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;cmu.edu;cmu.edu", "position": "Postdoc;Graduate student;Associate Professor", "bibtex": "@inproceedings{\nke2024d,\ntitle={3D Diffuser Actor: Policy Diffusion with 3D Scene Representations},\nauthor={Tsung-Wei Ke and Nikolaos Gkanatsios and Katerina Fragkiadaki},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gqCQxObVz2}\n}", "github": "https://github.com/nickgkan/3d_diffuser_actor", "project": "", "reviewers": "BGu2;45HC;cssK", "site": "https://openreview.net/forum?id=gqCQxObVz2", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 117, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13487403380500805712&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 6, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "gqFIybpsLX", "title": "Avoid Everything: Model-Free Collision Avoidance with Expert-Guided Fine-Tuning", "track": "main", "status": "Poster", "tldr": "", "abstract": "The world is full of clutter. In order to operate effectively in uncontrolled, real world spaces, robots must navigate safely by executing tasks around obstacles while in proximity to hazards. Creating safe movement for robotic manipulators remains a long-standing challenge in robotics, particularly in environments with partial observability. In partially observed settings, classical techniques often fail. Learned end-to-end motion policies can infer correct solutions in these settings, but are as-yet unable to produce reliably safe movement when close to obstacles. In this work, we introduce Avoid Everything, a novel end-to-end system for generating collision-free motion toward a target, even targets close to obstacles. 
Avoid Everything consists of two parts: 1) Motion Policy Transformer (M$\\pi$Former), a transformer architecture for end-to-end joint space control from point clouds, trained on over 1,000,000 expert trajectories and 2) a fine-tuning procedure we call Refining on Optimized Policy Experts (ROPE), which uses optimization to provide demonstrations of safe behavior in challenging states. With these techniques, we are able to successfully solve over 63% of reaching problems that caused the previous state of the art method to fail, resulting in an overall success rate of over 91\\% in challenging manipulation settings.", "keywords": "Imitation Learning;Robotics;Collision Avoidance;Fine Tuning;Motion Planning", "primary_area": "", "supplementary_material": "/attachment/dbfc7a276ee597d81a547352236e8562caa6e67d.zip", "author": "Adam Fishman;Aaron Walsman;Mohak Bhardwaj;Wentao Yuan;Balakumar Sundaralingam;Byron Boots;Dieter Fox", "authorids": "~Adam_Fishman1;~Aaron_Walsman1;~Mohak_Bhardwaj1;~Wentao_Yuan1;~Balakumar_Sundaralingam1;~Byron_Boots1;~Dieter_Fox1", "gender": "M;M;;M;M;;M", "homepage": "https://fishbotics.com;http://www.aaronwalsman.com;;https://wentaoyuan.github.io;https://balakumar-s.github.io/;;https://homes.cs.washington.edu/~fox/", "dblp": ";159/1658;;225/4795.html;;;f/DieterFox", "google_scholar": "ciayRBYAAAAJ;57k3iWkAAAAJ;;PZZZG6YAAAAJ;https://scholar.google.com/citations?hl=en;;DqXsbPAAAAAJ", "orcid": ";;;0000-0002-3836-8877;;;", "linkedin": "fishmanadam/;;;;;;", "or_profile": "~Adam_Fishman1;~Aaron_Walsman1;~Mohak_Bhardwaj1;~Wentao_Yuan1;~Balakumar_Sundaralingam1;~Byron_Boots1;~Dieter_Fox1", "aff": "University of Washington;Harvard University;;University of Washington, Seattle;NVIDIA;;Department of Computer Science", "aff_domain": "washington.edu;harvard.edu;;uw.edu;nvidia.com;;cs.washington.edu", "position": "PhD student;Postdoc;;PhD student;Research Scientist;;Full Professor", "bibtex": "@inproceedings{\nfishman2024avoid,\ntitle={Avoid Everything: Model-Free Collision Avoidance with Expert-Guided Fine-Tuning},\nauthor={Adam Fishman and Aaron Walsman and Mohak Bhardwaj and Wentao Yuan and Balakumar Sundaralingam and Byron Boots and Dieter Fox},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gqFIybpsLX}\n}", "github": "https://github.com/fishbotics/avoid-everything", "project": "", "reviewers": "xqeu;Ue3E;SUwx", "site": "https://openreview.net/forum?id=gqFIybpsLX", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13287824343899787132&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "University of Washington;Harvard University;NVIDIA;Unknown Institution", "aff_unique_dep": ";;NVIDIA Corporation;Department of Computer Science", "aff_unique_url": "https://www.washington.edu;https://www.harvard.edu;https://www.nvidia.com;", "aff_unique_abbr": "UW;Harvard;NVIDIA;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "gvdXE7ikHI", "title": "ALOHA Unleashed: A Simple Recipe for Robot Dexterity", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent work has shown promising results for learning end-to-end robot policies 
using imitation learning. In this work we address the question of how far can we push imitation learning for challenging dexterous manipulation tasks. We show that a simple recipe of large scale data collection on the ALOHA 2 platform, combined with expressive models such as Diffusion Policies, can be effective in learning challenging bimanual manipulation tasks involving deformable objects and complex contact rich dynamics. We demonstrate our recipe on 5 challenging real-world and 3 simulated tasks and demonstrate improved performance over state-of-the-art baselines.", "keywords": "Imitation Learning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/dd205fcc0ed6b3d3a3e4b7fff38b459b9590cdcd.zip", "author": "Tony Z. Zhao;Jonathan Tompson;Danny Driess;Pete Florence;Seyed Kamyar Seyed Ghasemipour;Chelsea Finn;Ayzaan Wahid", "authorids": "~Tony_Z._Zhao1;~Jonathan_Tompson1;~Danny_Driess1;~Pete_Florence1;~Seyed_Kamyar_Seyed_Ghasemipour1;~Chelsea_Finn1;~Ayzaan_Wahid1", "gender": ";M;;;M;F;M", "homepage": "https://tonyzhaozh.github.io/;http://jonathantompson.com;https://dannydriess.github.io/;http://www.peteflorence.com/;http://www.cs.utoronto.ca/~kamyar/;https://ai.stanford.edu/~cbfinn/;https://ayzaan.com", "dblp": ";139/0769;;;238/2555;131/1783;", "google_scholar": ";U_Jw8DUAAAAJ;https://scholar.google.de/citations?user=wxnzyjwAAAAJ;;LHvso9QAAAAJ;vfPE6hgAAAAJ;", "orcid": ";;;;;;", "linkedin": ";;;;;;", "or_profile": "~Tony_Z._Zhao1;~Jonathan_Tompson1;~Danny_Driess1;~Pete_Florence1;~Seyed_Kamyar_Seyed_Ghasemipour1;~Chelsea_Finn1;~Ayzaan_Wahid1", "aff": "Stanford University;Google DeepMind;Google;Google;Google DeepMind Robotics;Google;Robotics at Google", "aff_domain": "stanford.edu;google.com;google.com;google.com;google.com;google.com;google.com", "position": "PhD student;Researcher;Researcher;Research Scientist;Student Researcher;Research Scientist;Software Engineer", "bibtex": "@inproceedings{\nzhao2024aloha,\ntitle={{ALOHA} Unleashed: A Simple Recipe for Robot Dexterity},\nauthor={Tony Z. Zhao and Jonathan Tompson and Danny Driess and Pete Florence and Seyed Kamyar Seyed Ghasemipour and Chelsea Finn and Ayzaan Wahid},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=gvdXE7ikHI}\n}", "github": "", "project": "", "reviewers": "zwLY;qSxg;zByN", "site": "https://openreview.net/forum?id=gvdXE7ikHI", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 83, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17396942207811396400&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1;1;1;1;1", "aff_unique_norm": "Stanford University;Google", "aff_unique_dep": ";Google DeepMind", "aff_unique_url": "https://www.stanford.edu;https://deepmind.com", "aff_unique_abbr": "Stanford;DeepMind", "aff_campus_unique_index": "0;2;2;2;2", "aff_campus_unique": "Stanford;;Mountain View", "aff_country_unique_index": "0;1;0;0;1;0;0", "aff_country_unique": "United States;United Kingdom" }, { "id": "hV97HJm7Ag", "title": "Task-Oriented Hierarchical Object Decomposition for Visuomotor Control", "track": "main", "status": "Poster", "tldr": "", "abstract": "Good pre-trained visual representations could enable robots to learn visuomotor policy efficiently. 
Still, existing representations take a one-size-fits-all-tasks approach that comes with two important drawbacks: (1) Being completely task-agnostic, these representations cannot effectively ignore any task-irrelevant information in the scene, and (2) They often lack the representational capacity to handle unconstrained/complex real-world scenes. Instead, we propose to train a large combinatorial family of representations organized by scene entities: objects and object parts. This hierarchical object decomposition for task-oriented representations (HODOR) permits selectively assembling different representations specific to each task while scaling in representational capacity with the complexity of the scene and the task. In our experiments, we find that HODOR outperforms prior pre-trained representations, both scene vector representations and object-centric representations, for sample-efficient imitation learning across 5 simulated and 5 real-world manipulation tasks. We further find that the invariances captured in HODOR are inherited into downstream policies, which can robustly generalize to out-of-distribution test conditions, permitting zero-shot skill chaining. Appendix and videos: https://sites.google.com/view/\nhodor-corl24", "keywords": "Visual Representations;Entities;Imitation;Manipulation", "primary_area": "", "supplementary_material": "/attachment/93c59a938fb52831b0c036ae88add8584cbc74a9.zip", "author": "Jianing Qian;Yunshuang Li;Bernadette Bucher;Dinesh Jayaraman", "authorids": "~Jianing_Qian2;~Yunshuang_Li1;~Bernadette_Bucher1;~Dinesh_Jayaraman2", "gender": "F;F;F;M", "homepage": ";https://li-yunshuang.github.io/;http://bernadettekbucher.com;https://www.seas.upenn.edu/~dineshj/", "dblp": ";;251/5461;145/3870", "google_scholar": "o67NTxYAAAAJ;;VIZvaGsAAAAJ;QxLpghAAAAAJ", "orcid": ";;;0000-0002-6888-3095", "linkedin": ";;bernadette-bucher-09898536/;dinesh-jayaraman-44b31539/", "or_profile": "~Jianing_Qian2;~Yunshuang_Li1;~Bernadette_Bucher1;~Dinesh_Jayaraman2", "aff": "School of Engineering and Applied Science, University of Pennsylvania;University of Pennsylvania;Boston Dynamics AI Institute;University of Pennsylvania", "aff_domain": "seas.upenn.edu;upenn.edu;theaiinstitute.com;upenn.edu", "position": "PhD student;MS student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nqian2024taskoriented,\ntitle={Task-Oriented Hierarchical Object Decomposition for Visuomotor Control},\nauthor={Jianing Qian and Yunshuang Li and Bernadette Bucher and Dinesh Jayaraman},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=hV97HJm7Ag}\n}", "github": "", "project": "", "reviewers": "U4ZG;9qHC;cTv2;fKHY", "site": "https://openreview.net/forum?id=hV97HJm7Ag", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "5;4;3;5", "rating_avg": 3.0, "confidence_avg": 4.25, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:V0ESzmiqw_cJ:scholar.google.com/&scioq=Task-Oriented+Hierarchical+Object+Decomposition+for+Visuomotor+Control&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of Pennsylvania;Boston Dynamics AI Institute", "aff_unique_dep": "School of Engineering and Applied Science;AI Institute", "aff_unique_url": "https://www.upenn.edu;https://www.bostondynamics.com/", "aff_unique_abbr": "UPenn;BD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": 
"0;0;0;0", "aff_country_unique": "United States" }, { "id": "iZF0FRPgfq", "title": "I Can Tell What I am Doing: Toward Real-World Natural Language Grounding of Robot Experiences", "track": "main", "status": "Poster", "tldr": "", "abstract": "Understanding robot behaviors and experiences through natural language is crucial for developing intelligent and transparent robotic systems. Recent advancement in large language models (LLMs) makes it possible to translate complex, multi-modal robotic experiences into coherent, human-readable narratives. However, grounding real-world robot experiences into natural language is challenging due to many reasons, such as multi-modal nature of data, differing sample rates, and data volume. We introduce RONAR, an LLM-based system that generates natural language narrations from robot experiences, aiding in behavior announcement, failure analysis, and human interaction to recover failure. Evaluated across various scenarios, RONAR outperforms state-of-the-art methods and improves failure recovery efficiency. Our contributions include a multi-modal framework for robot experience narration, a comprehensive real-robot dataset, and empirical evidence of RONAR's effectiveness in enhancing user experience in system transparency and failure analysis.", "keywords": "Large Language Model;Explainable AI;Failure Analysis", "primary_area": "", "supplementary_material": "/attachment/85e1a3fe3ca04237f38c31c39ae6eeddd1b91910.zip", "author": "Zihan Wang;Brian Liang;Varad Dhat;Zander Brumbaugh;Nick Walker;Ranjay Krishna;Maya Cakmak", "authorids": "~Zihan_Wang14;~Brian_Liang1;~Varad_Dhat1;~Zander_Brumbaugh1;~Nick_Walker1;~Ranjay_Krishna1;~Maya_Cakmak1", "gender": "M;M;M;M;M;M;F", "homepage": "https://avinwangzh.github.io/;https://brinliang.github.io/;;;https://nickwalker.us;http://ranjaykrishna.com;https://homes.cs.washington.edu/~mcakmak/", "dblp": ";;;;14/1613-1;167/3785;65/6092", "google_scholar": ";;GfGwy48AAAAJ;;JYaJjE8AAAAJ;IcqahyAAAAAJ;https://scholar.google.com.tw/citations?user=sPlonWcAAAAJ", "orcid": ";;0009-0006-5427-1326;0000-0001-9190-6227;0000-0001-7711-0003;0000-0001-8784-2531;", "linkedin": ";;;zanderbrumbaugh/;;ranjay-krishna-1a344444/;", "or_profile": "~Zihan_Wang14;~Brian_Liang1;~Varad_Dhat1;~Zander_Brumbaugh1;~Nick_Walker1;~Ranjay_Krishna1;~Maya_\u00c7akmak1", "aff": "University of Washington;University of Washington;University of Washington;Department of Computer Science;University of Washington;University of Washington;University of Washington, Seattle", "aff_domain": "uw.edu;cs.washington.edu;uw.edu;cs.washington.edu;washington.edu;cs.washington.edu;uw.edu", "position": "PhD student;Undergrad student;MS student;MS student;PhD student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024i,\ntitle={I Can Tell What I am Doing: Toward Real-World Natural Language Grounding of Robot Experiences},\nauthor={Zihan Wang and Brian Liang and Varad Dhat and Zander Brumbaugh and Nick Walker and Ranjay Krishna and Maya Cakmak},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=iZF0FRPgfq}\n}", "github": "", "project": "", "reviewers": "kuGW;6w4H;7Xr6", "site": "https://openreview.net/forum?id=iZF0FRPgfq", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=17450665595275853952&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "University of Washington;Unknown Institution", "aff_unique_dep": ";Department of Computer Science", "aff_unique_url": "https://www.washington.edu;", "aff_unique_abbr": "UW;", "aff_campus_unique_index": "1", "aff_campus_unique": ";Seattle", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "itKJ5uu1gW", "title": "Dynamic 3D Gaussian Tracking for Graph-Based Neural Dynamics Modeling", "track": "main", "status": "Poster", "tldr": "", "abstract": "Videos of robots interacting with objects encode rich information about the objects' dynamics. However, existing video prediction approaches typically do not explicitly account for the 3D information from videos, such as robot actions and objects' 3D states, limiting their use in real-world robotic applications. In this work, we introduce a framework to learn object dynamics directly from multi-view RGB videos by explicitly considering the robot's action trajectories and their effects on scene dynamics. We utilize the 3D Gaussian representation of 3D Gaussian Splatting (3DGS) to train a particle-based dynamics model using Graph Neural Networks. This model operates on sparse control particles downsampled from the densely tracked 3D Gaussian reconstructions. By learning the neural dynamics model on offline robot interaction data, our method can predict object motions under varying initial configurations and unseen robot actions. The 3D transformations of Gaussians can be interpolated from the motions of control particles, enabling the rendering of predicted future object states and achieving action-conditioned video prediction. The dynamics model can also be applied to model-based planning frameworks for object manipulation tasks. We conduct experiments on various kinds of deformable materials, including ropes, clothes, and stuffed animals, demonstrating our framework's ability to model complex shapes and dynamics. 
Our project page is available at \\url{https://gaussian-gbnd.github.io/}.", "keywords": "Dynamics Model;3D Gaussian Splatting;Action-Conditioned Video Prediction;Model-Based Planning", "primary_area": "", "supplementary_material": "/attachment/ccaea26a97dad4b7da3492b27bea75cfe47656b9.zip", "author": "Mingtong Zhang;Kaifeng Zhang;Yunzhu Li", "authorids": "~Mingtong_Zhang1;~Kaifeng_Zhang2;~Yunzhu_Li1", "gender": "M;M;M", "homepage": "https://robo-alex.github.io/;https://kywind.github.io/;https://yunzhuli.github.io/", "dblp": ";133/5217;182/1831", "google_scholar": ";jwkE2lgAAAAJ;WlA92lcAAAAJ", "orcid": ";;", "linkedin": ";;", "or_profile": "~Mingtong_Zhang1;~Kaifeng_Zhang2;~Yunzhu_Li1", "aff": "University of Illinois, Urbana Champaign;University of Illinois Urbana-Champaign;University of Illinois Urbana-Champaign", "aff_domain": "illinois.edu;illinois.edu;illinois.edu", "position": "MS student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024dynamic,\ntitle={Dynamic 3D Gaussian Tracking for Graph-Based Neural Dynamics Modeling},\nauthor={Mingtong Zhang and Kaifeng Zhang and Yunzhu Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=itKJ5uu1gW}\n}", "github": "", "project": "", "reviewers": "9VLH;sKFF;tMai", "site": "https://openreview.net/forum?id=itKJ5uu1gW", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14937112069419299341&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Illinois Urbana-Champaign", "aff_unique_dep": "", "aff_unique_url": "https://illinois.edu", "aff_unique_abbr": "UIUC", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Urbana-Champaign", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "jPkOFAiOzf", "title": "Region-aware Grasp Framework with Normalized Grasp Space for Efficient 6-DoF Grasping", "track": "main", "status": "Poster", "tldr": "", "abstract": "A series of region-based methods succeed in extracting regional features and enhancing grasp detection quality. However, faced with a cluttered scene with potential collision, the definition of the grasp-relevant region stays inconsistent. In this paper, we propose Normalized Grasp Space (NGS) from a novel region-aware viewpoint, unifying the grasp representation within a normalized regional space and benefiting the generalizability of methods. Leveraging the NGS, we find that CNNs are underestimated for 3D feature extraction and 6-DoF grasp detection in clutter scenes and build a highly efficient Region-aware Normalized Grasp Network (RNGNet). Experiments on the public benchmark show that our method achieves significant >20 % performance gains while attaining a real-time inference speed of approximately 50 FPS. Real-world cluttered scene clearance experiments underscore the effectiveness of our method. 
Further, human-to-robot handover and dynamic object grasping experiments demonstrate the potential of our proposed method for closed-loop grasping in dynamic scenarios.", "keywords": "6-DoF Grasping;RGBD Perception;Normalized Space;Heatmap", "primary_area": "", "supplementary_material": "/attachment/a7f4d16873e3c51b9ad97ca8ee51d74b26e856dd.zip", "author": "Siang Chen;Pengwei Xie;Wei Tang;Dingchang Hu;Yixiang Dai;Guijin Wang", "authorids": "~Siang_Chen1;~Pengwei_Xie1;~Wei_Tang22;~Dingchang_Hu1;daiyx23@mails.tsinghua.edu.cn;~Guijin_Wang1", "gender": "M;;;;;M", "homepage": "https://chenthree.github.io;;;;;http://web.ee.tsinghua.edu.cn/wangguijin/zh_CN/index/2769/list/index.htm", "dblp": ";;;;;37/6836", "google_scholar": "dSR-fYUAAAAJ;;;;;qDjozE4AAAAJ", "orcid": ";;;;;", "linkedin": ";;;;;", "or_profile": "~Siang_Chen1;~Pengwei_Xie1;~Wei_Tang22;~Dingchang_Hu1;daiyx23@mails.tsinghua.edu.cn;~Guijin_Wang1", "aff": "Tsinghua University;;;;;Department of Electronic Engineering, Tsinghua University", "aff_domain": "mail.tsinghua.edu.cn;;;;;tsinghua.edu.cn", "position": "PhD student;;;;;Full Professor", "bibtex": "@inproceedings{\nchen2024regionaware,\ntitle={Region-aware Grasp Framework with Normalized Grasp Space for Efficient 6-DoF Grasping},\nauthor={Siang Chen and Pengwei Xie and Wei Tang and Dingchang Hu and Yixiang Dai and Guijin Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jPkOFAiOzf}\n}", "github": "https://github.com/THU-VCLab/RegionNormalizedGrasp", "project": "", "reviewers": "GvsS;yaiB;aAbj", "site": "https://openreview.net/forum?id=jPkOFAiOzf", "pdf_size": 0, "rating": "3;4;4", "confidence": "3;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 6, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14226168013788336808&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0", "aff_unique_norm": "Tsinghua University", "aff_unique_dep": "", "aff_unique_url": "https://www.tsinghua.edu.cn", "aff_unique_abbr": "THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "China" }, { "id": "jart4nhCQr", "title": "Learning to Manipulate Anywhere: A Visual Generalizable Framework For Reinforcement Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Can we endow visuomotor robots with generalization capabilities to operate in diverse open-world scenarios? In this paper, we propose Maniwhere, a generalizable framework tailored for visual reinforcement learning, enabling the trained robot policies to generalize across a combination of multiple visual disturbance types. Specifically, we introduce a multi-view representation learning approach fused with Spatial Transformer Network (STN) module to capture shared semantic information and correspondences among different viewpoints. In addition, we employ a curriculum-based randomization and augmentation approach to stabilize the RL training process and strengthen the visual generalization ability. To exhibit the effectiveness of Maniwhere, we meticulously design **8** tasks encompassing articulate objects, bi-manual, and dexterous hand manipulation tasks, demonstrating Maniwhere's strong visual generalization and sim2real transfer abilities across **3** hardware platforms. 
Our experiments show that Maniwhere significantly outperforms existing state-of-the-art methods. Videos are provided at https://maniwhere.github.io.", "keywords": "Visual Generalization;Sim2real;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/fca58ed42365c5dcca6cff0edc948e4db41fdf09.zip", "author": "Zhecheng Yuan;Tianming Wei;Shuiqi Cheng;Gu Zhang;Yuanpei Chen;Huazhe Xu", "authorids": "~Zhecheng_Yuan1;~Tianming_Wei1;~Shuiqi_Cheng2;~Gu_Zhang1;~Yuanpei_Chen2;~Huazhe_Xu1", "gender": "M;M;;M;M;M", "homepage": "http://www.github.com;https://github.com/Stillwtm;https://github.com/shuiqicheng;https://www.gu-zhang.com/;https://cypypccpy.github.io/;http://hxu.rocks", "dblp": "314/5755;158/4609;;;1234567;164/9006", "google_scholar": ";5rQ6wdQAAAAJ;;ctFTmmgAAAAJ;https://scholar.google.com/citations?hl=en;t9HPFawAAAAJ", "orcid": ";;;;0000-0002-0033-492X;", "linkedin": ";;;;;", "or_profile": "~Zhecheng_Yuan1;~Tianming_Wei1;~Shuiqi_Cheng2;~Gu_Zhang1;~Yuanpei_Chen2;~Huazhe_Xu1", "aff": ";Shanghai Jiaotong University;University of Hong Kong;Shanghai Jiaotong University;PsiRobot;Tsinghua University", "aff_domain": ";sjtu.edu.cn;hku.hk;sjtu.edu.cn;psibot.ai;tsinghua.edu.cn", "position": ";Undergrad student;Researcher;Undergrad student;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nyuan2024learning,\ntitle={Learning to Manipulate Anywhere: A Visual Generalizable Framework For Reinforcement Learning},\nauthor={Zhecheng Yuan and Tianming Wei and Shuiqi Cheng and Gu Zhang and Yuanpei Chen and Huazhe Xu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jart4nhCQr}\n}", "github": "", "project": "", "reviewers": "vrac;jsuf;RYkn", "site": "https://openreview.net/forum?id=jart4nhCQr", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;4;4", "rating_avg": 4.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 22, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10393458300977260611&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;2;3", "aff_unique_norm": "Shanghai Jiao Tong University;University of Hong Kong;PsiRobot;Tsinghua University", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.hku.hk;;https://www.tsinghua.edu.cn", "aff_unique_abbr": "SJTU;HKU;;THU", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "China;" }, { "id": "jnubz7wB2w", "title": "Verification of Neural Control Barrier Functions with Symbolic Derivative Bounds Propagation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Control barrier functions (CBFs) are important in safety-critical systems and robot control applications. Neural networks have been used to parameterize and synthesize CBFs with bounded control input for complex systems. However, it is still challenging to verify pre-trained neural networks CBFs (neural CBFs) in an efficient symbolic manner. To this end, we propose a new efficient verification framework for ReLU-based neural CBFs through symbolic derivative bound propagation by combining the linearly bounded nonlinear dynamic system and the gradient bounds of neural CBFs. 
Specifically, with Heaviside step function form for derivatives of activation functions, we show that the symbolic bounds can be propagated through the inner product of neural CBF Jacobian and nonlinear system dynamics. Through extensive experiments on different robot dynamics, our results outperform the interval arithmetic-based baselines in verified rate and verification time along the CBF boundary, validating the effectiveness and efficiency of the proposed method with different model complexity. The code can be found at https://github.com/intelligent-control-lab/verify-neural-CBF.", "keywords": "Learning for control;control barrier function;formal verification", "primary_area": "", "supplementary_material": "/attachment/a708cba3424add9bdb18b763484259786bc6a3d8.zip", "author": "Hanjiang Hu;Yujie Yang;Tianhao Wei;Changliu Liu", "authorids": "~Hanjiang_Hu1;~Yujie_Yang1;~Tianhao_Wei1;~Changliu_Liu1", "gender": "M;M;M;F", "homepage": "https://cs.cmu.edu/~hanjianh;https://yangyujie-jack.github.io/;;http://www.cs.cmu.edu/~cliu6/index.html", "dblp": "249/5764;;222/5386;166/3563", "google_scholar": "https://scholar.google.com/citations?hl=en;2T7-s0MAAAAJ;V22j1C0AAAAJ;", "orcid": ";0000-0001-7222-0019;;", "linkedin": "hanjiang-hu-54337b196/;;;", "or_profile": "~Hanjiang_Hu1;~Yujie_Yang1;~Tianhao_Wei1;~Changliu_Liu1", "aff": "School of Computer Science, Carnegie Mellon University;Tsinghua University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cs.cmu.edu;tsinghua.edu.cn;andrew.cmu.edu;cmu.edu", "position": "MS student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nhu2024verification,\ntitle={Verification of Neural Control Barrier Functions with Symbolic Derivative Bounds Propagation},\nauthor={Hanjiang Hu and Yujie Yang and Tianhao Wei and Changliu Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=jnubz7wB2w}\n}", "github": "https://github.com/intelligent-control-lab/verify-neural-CBF", "project": "", "reviewers": "sbF7;mp6J;KbLk", "site": "https://openreview.net/forum?id=jnubz7wB2w", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;3;3", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14037617823691746885&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;0;0", "aff_unique_norm": "Carnegie Mellon University;Tsinghua University", "aff_unique_dep": "School of Computer Science;", "aff_unique_url": "https://www.cmu.edu;https://www.tsinghua.edu.cn", "aff_unique_abbr": "CMU;THU", "aff_campus_unique_index": "0", "aff_campus_unique": "Pittsburgh;", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "United States;China" }, { "id": "k0ogr4dnhG", "title": "ClutterGen: A Cluttered Scene Generator for Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce ClutterGen, a physically compliant simulation scene generator capable of producing highly diverse, cluttered, and stable scenes for robot learning. Generating such scenes is challenging as each object must adhere to physical laws like gravity and collision. As the number of objects increases, finding valid poses becomes more difficult, necessitating significant human engineering effort, which limits the diversity of the scenes. 
To overcome these challenges, we propose a reinforcement learning method that can be trained with physics-based reward signals provided by the simulator. Our experiments demonstrate that ClutterGen can generate cluttered object layouts with up to ten objects on confined table surfaces. Additionally, our policy design explicitly encourages the diversity of the generated scenes for open-ended generation. Our real-world robot results show that ClutterGen can be directly used for clutter rearrangement and stable placement policy training.", "keywords": "Simulation Scene Generation;Manipulation;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/51bc627a59657d6a185a6e009e60efbce8f74092.zip", "author": "Yinsen Jia;Boyuan Chen", "authorids": "~Yinsen_Jia2;~Boyuan_Chen1", "gender": "M;Not Specified", "homepage": "https://yjia.net/;http://boyuanchen.com/", "dblp": ";193/7174-1", "google_scholar": "https://scholar.google.com/citations?hl=en;5DBpY6EAAAAJ", "orcid": ";", "linkedin": ";boyuan-chen-b30854a0/", "or_profile": "~Yinsen_Jia2;~Boyuan_Chen1", "aff": "Duke University;Duke University", "aff_domain": "duke.edu;duke.edu", "position": "PhD student;Assistant Professor", "bibtex": "@inproceedings{\njia2024cluttergen,\ntitle={ClutterGen: A Cluttered Scene Generator for Robot Learning},\nauthor={Yinsen Jia and Boyuan Chen},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k0ogr4dnhG}\n}", "github": "https://github.com/generalroboticslab/ClutterGen", "project": "", "reviewers": "qZ9R;fQzd;4LKe", "site": "https://openreview.net/forum?id=k0ogr4dnhG", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 2, "corr_rating_confidence": -0.5, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7593250650520472538&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Duke University", "aff_unique_dep": "", "aff_unique_url": "https://www.duke.edu", "aff_unique_abbr": "Duke", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United States" }, { "id": "k4Nnxqcwt8", "title": "Q-SLAM: Quadric Representations for Monocular SLAM", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this paper, we reimagine volumetric representations through the lens of quadrics. We posit that rigid scene components can be effectively decomposed into quadric surfaces. Leveraging this assumption, we reshape the volumetric representations of millions of cubes into several quadric planes, which results in more accurate and efficient modeling of 3D scenes in SLAM contexts. First, we use the quadric assumption to rectify noisy depth estimations from RGB inputs. This step significantly improves depth estimation accuracy, and allows us to efficiently sample ray points around quadric planes instead of the entire volume space in previous NeRF-SLAM systems. Second, we introduce a novel quadric-decomposed transformer to aggregate information across quadrics. The quadric semantics are not only explicitly used for depth correction and scene decomposition, but also serve as an implicit supervision signal for the mapping network. 
Through rigorous experimental evaluation, our method exhibits superior performance over other approaches relying on estimated depth, and achieves comparable accuracy to methods utilizing ground truth depth on both synthetic and real-world datasets.", "keywords": "Neural Radiance Fields;Simultaneous Localization and Mapping", "primary_area": "", "supplementary_material": "/attachment/42421fba54bbc6b8e5bfaa29407401e38019f158.zip", "author": "Chensheng Peng;Chenfeng Xu;Yue Wang;Mingyu Ding;Heng Yang;Masayoshi Tomizuka;Kurt Keutzer;Marco Pavone;Wei Zhan", "authorids": "~Chensheng_Peng1;~Chenfeng_Xu1;~Yue_Wang2;~Mingyu_Ding1;~Heng_Yang4;~Masayoshi_Tomizuka2;~Kurt_Keutzer1;~Marco_Pavone1;~Wei_Zhan2", "gender": ";M;M;M;M;;M;M;", "homepage": ";;https://yuewang.xyz;https://dingmyu.github.io/;https://hankyang.seas.harvard.edu/;;https://people.eecs.berkeley.edu/~keutzer/;https://web.stanford.edu/~pavone/;", "dblp": ";65/1881;33/4822-41;188/5243;83/415-2;;k/KurtKeutzer.html;91/3382-1.html;", "google_scholar": ";RpqvaTUAAAAJ;v-AEFIEAAAAJ;w4yTWwoAAAAJ;GuKEDfixZqsC;;ID9QePIAAAAJ;RhOpyXcAAAAJ;", "orcid": ";0000-0002-4941-6985;;0000-0001-6556-8359;;;0000-0003-3868-8501;;", "linkedin": ";;;dingmyu/;;;kurtkeutzer/;;", "or_profile": "~Chensheng_Peng1;~Chenfeng_Xu1;~Yue_Wang2;~Mingyu_Ding1;~Heng_Yang4;~Masayoshi_Tomizuka2;~Kurt_Keutzer1;~Marco_Pavone1;~Wei_Zhan2", "aff": ";University of California, Berkeley;NVIDIA;University of California, Berkeley;NVIDIA;;University of California, Berkeley;Stanford University;", "aff_domain": ";berkeley.edu;nvidia.com;berkeley.edu;nvidia.com;;berkeley.edu;stanford.edu;", "position": ";PhD student;Researcher;Postdoc;Researcher;;Full Professor;Associate Professor;", "bibtex": "@inproceedings{\npeng2024qslam,\ntitle={Q-{SLAM}: Quadric Representations for Monocular {SLAM}},\nauthor={Chensheng Peng and Chenfeng Xu and Yue Wang and Mingyu Ding and Heng Yang and Masayoshi Tomizuka and Kurt Keutzer and Marco Pavone and Wei Zhan},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=k4Nnxqcwt8}\n}", "github": "https://github.com/PholyPeng/Q-SLAM", "project": "", "reviewers": "V7cK;csiv;Z7x8", "site": "https://openreview.net/forum?id=k4Nnxqcwt8", "pdf_size": 0, "rating": "1;3;4", "confidence": "5;4;3", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": -0.9819805060619659, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11181558613437073750&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;1;0;2", "aff_unique_norm": "University of California, Berkeley;NVIDIA;Stanford University", "aff_unique_dep": ";NVIDIA Corporation;", "aff_unique_url": "https://www.berkeley.edu;https://www.nvidia.com;https://www.stanford.edu", "aff_unique_abbr": "UC Berkeley;NVIDIA;Stanford", "aff_campus_unique_index": "0;0;0;2", "aff_campus_unique": "Berkeley;;Stanford", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "kEZXeaMrkD", "title": "Goal-Reaching Policy Learning from Non-Expert Observations via Effective Subgoal Guidance", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this work, we address the challenging problem of long-horizon goal-reaching policy learning from non-expert, action-free observation data. Unlike fully labeled expert data, our data is more accessible and avoids the costly process of action labeling. 
Additionally, compared to online learning, which often involves aimless exploration, our data provides useful guidance for more efficient exploration. To achieve our goal, we propose a novel subgoal guidance learning strategy. The motivation behind this strategy is that long-horizon goals offer limited guidance for efficient exploration and accurate state transition. We develop a diffusion strategy-based high-level policy to generate reasonable subgoals as waypoints, preferring states that more easily lead to the final goal. Additionally, we learn state-goal value functions to encourage efficient subgoal reaching. These two components naturally integrate into the off-policy actor-critic framework, enabling efficient goal attainment through informative exploration. We evaluate our method on complex robotic navigation and manipulation tasks, demonstrating a significant performance advantage over existing methods. Our ablation study further shows that our method is robust to observation data with various corruptions.", "keywords": "Goal-Reaching;Long-Horizon;Non-Expert Observation Data", "primary_area": "", "supplementary_material": "/attachment/e0a185786e3f36ce2ae6779d97feb20f5a15bed0.zip", "author": "RenMing Huang;Shaochong Liu;Yunqiang Pei;Peng Wang;Guoqing Wang;Yang Yang;Heng Tao Shen", "authorids": "~RenMing_Huang1;~Shaochong_Liu1;~Yunqiang_Pei1;~Peng_Wang19;~Guoqing_Wang2;~Yang_Yang37;~Heng_Tao_Shen3", "gender": "M;M;M;M;M;M;M", "homepage": "https://github.com/RenMing-Huang;;https://github.com/Simon1059770342;https://wp8619.github.io/;https://faculty.uestc.edu.cn/wangguoqing1/zh_CN/index.htm;http://cfm.uestc.edu.cn/~yangyang/;https://cfm.uestc.edu.cn/~shenht/", "dblp": ";;;95/4442-23.html;17/356-1;;s/HTShen", "google_scholar": ";;;vIr3ICQAAAAJ;https://scholar.google.com/citations?hl=en;;https://scholar.google.com.au/citations?user=krryaDkAAAAJ", "orcid": ";0009-0009-9183-2585;;;;;", "linkedin": ";;;;;;", "or_profile": "~RenMing_Huang1;~Shaochong_Liu1;~Yunqiang_Pei1;~Peng_Wang19;~Guoqing_Wang2;~Yang_Yang37;~Hengtao_Shen1", "aff": "University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;University of Electronic Science and Technology of China;Tongji University", "aff_domain": "uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;uestc.edu.cn;tongji.edu.cn", "position": "MS student;MS student;PhD student;Full Professor;Full Professor;Full Professor;Full Professor", "bibtex": "@inproceedings{\nhuang2024goalreaching,\ntitle={Goal-Reaching Policy Learning from Non-Expert Observations via Effective Subgoal Guidance},\nauthor={RenMing Huang and Shaochong Liu and Yunqiang Pei and Peng Wang and Guoqing Wang and Yang Yang and Heng Tao Shen},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=kEZXeaMrkD}\n}", "github": "https://github.com/RenMing-Huang/EGR-PO", "project": "", "reviewers": "RXHf;qsNz;onEe", "site": "https://openreview.net/forum?id=kEZXeaMrkD", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15852830622895930929&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, 
"aff_unique_index": "0;0;0;0;0;0;1", "aff_unique_norm": "University of Electronic Science and Technology of China;Tongji University", "aff_unique_dep": ";", "aff_unique_url": "https://www.uestc.edu.cn;https://www.tongji.edu.cn", "aff_unique_abbr": "UESTC;Tongji", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "lKGRPJFPCM", "title": "InterACT: Inter-dependency Aware Action Chunking with Hierarchical Attention Transformers for Bimanual Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present InterACT: Inter-dependency aware Action Chunking with Hierarchical Attention Transformers, a novel imitation learning framework for bimanual manipulation that integrates hierarchical attention to capture inter-dependencies between dual-arm joint states and visual inputs. InterACT consists of a Hierarchical Attention Encoder and a Multi-arm Decoder, both designed to enhance information aggregation and coordination. The encoder processes multi-modal inputs through segment-wise and cross-segment attention mechanisms, while the decoder leverages synchronization blocks to refine individual action predictions, providing the counterpart's prediction as context. Our experiments on a variety of simulated and real-world bimanual manipulation tasks demonstrate that InterACT significantly outperforms existing methods. Detailed ablation studies validate the contributions of key components of our work, including the impact of CLS tokens, cross-segment encoders, and synchronization blocks.", "keywords": "Robotics;Imitation Learning;Bimanual Manipulation", "primary_area": "", "supplementary_material": "/attachment/28294eff7826122805c5292c3bacb402599789a0.zip", "author": "Andrew Choong-Won Lee;Ian Chuang;Ling-Yuan Chen;Iman Soltani", "authorids": "~Andrew_Choong-Won_Lee1;~Ian_Chuang1;~Ling-Yuan_Chen1;~Iman_Soltani1", "gender": "M;M;M;M", "homepage": ";https://ian-chuang.github.io/;;https://soltanilab.engineering.ucdavis.edu/", "dblp": ";;;", "google_scholar": ";;;BJ2teVoAAAAJ", "orcid": ";0000-0002-1983-9848;;0000-0001-9430-1522", "linkedin": "andrewcwlee/;iantc104/;ling-yuan-chen-b7b14a226/;iman-soltani-31a7b281/", "or_profile": "~Andrew_Choong-Won_Lee1;~Ian_Chuang1;~Ling-Yuan_Chen1;~Iman_Soltani1", "aff": "University of California, Davis;University of California, Davis;University of California, Davis;University of California, Davis", "aff_domain": "ucdavis.edu;ucdavis.edu;ucdavis.edu;ucdavis.edu", "position": "PhD student;Undergrad student;MS student;Assistant Professor", "bibtex": "@inproceedings{\nlee2024interact,\ntitle={Inter{ACT}: Inter-dependency Aware Action Chunking with Hierarchical Attention Transformers for Bimanual Manipulation},\nauthor={Andrew Choong-Won Lee and Ian Chuang and Ling-Yuan Chen and Iman Soltani},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lKGRPJFPCM}\n}", "github": "", "project": "", "reviewers": "KWL4;Gcy9;n8FP", "site": "https://openreview.net/forum?id=lKGRPJFPCM", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10463529790546272162&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of California, Davis", "aff_unique_dep": "", 
"aff_unique_url": "https://www.ucdavis.edu", "aff_unique_abbr": "UC Davis", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Davis", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "lpjPft4RQT", "title": "TRANSIC: Sim-to-Real Policy Transfer by Learning from Online Correction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning in simulation and transferring the learned policy to the real world has the potential to enable generalist robots. The key challenge of this approach is to address simulation-to-reality (sim-to-real) gaps. Previous methods often require domain-specific knowledge *a priori*. We argue that a straightforward way to obtain such knowledge is by asking humans to observe and assist robot policy execution in the real world. The robots can then learn from humans to close various sim-to-real gaps. We propose TRANSIC, a data-driven approach to enable successful sim-to-real transfer based on a human-in-the-loop framework. TRANSIC allows humans to augment simulation policies to overcome various unmodeled sim-to-real gaps holistically through intervention and online correction. Residual policies can be learned from human corrections and integrated with simulation policies for autonomous execution. We show that our approach can achieve successful sim-to-real transfer in complex and contact-rich manipulation tasks such as furniture assembly. Through synergistic integration of policies learned in simulation and from humans, TRANSIC is effective as a holistic approach to addressing various, often coexisting sim-to-real gaps. It displays attractive properties such as scaling with human effort. Videos and code are available at https://transic-robot.github.io/.", "keywords": "Sim-to-Real Transfer;Human-in-the-Loop;Robot Manipulation", "primary_area": "", "supplementary_material": "/attachment/35a55f55af5226d6b231a795b3fa5302fc265497.zip", "author": "Yunfan Jiang;Chen Wang;Ruohan Zhang;Jiajun Wu;Li Fei-Fei", "authorids": "~Yunfan_Jiang1;~Chen_Wang16;~Ruohan_Zhang1;~Jiajun_Wu1;~Li_Fei-Fei1", "gender": "M;M;M;M;F", "homepage": "https://yunfanj.com/;http://www.chenwangjeremy.net/;https://ai.stanford.edu/~zharu/;https://jiajunwu.com;https://profiles.stanford.edu/fei-fei-li", "dblp": "311/5581-1;;;117/4768;79/2528", "google_scholar": "https://scholar.google.com/citations?hl=en;lStkAzsAAAAJ;-bqvNWoAAAAJ;2efgcS0AAAAJ;rDfyQnIAAAAJ", "orcid": ";;;0000-0002-4176-343X;", "linkedin": ";;;jiajunwu/;fei-fei-li-4541247/", "or_profile": "~Yunfan_Jiang1;~Chen_Wang16;~Ruohan_Zhang1;~Jiajun_Wu1;~Li_Fei-Fei1", "aff": "Stanford University;Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "cs.stanford.edu;cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;Postdoc;Assistant Professor;Full Professor", "bibtex": "@inproceedings{\njiang2024transic,\ntitle={{TRANSIC}: Sim-to-Real Policy Transfer by Learning from Online Correction},\nauthor={Yunfan Jiang and Chen Wang and Ruohan Zhang and Jiajun Wu and Li Fei-Fei},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lpjPft4RQT}\n}", "github": "https://github.com/transic-robot/transic", "project": "", "reviewers": "LZdQ;pa2D;5kGt", "site": "https://openreview.net/forum?id=lpjPft4RQT", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.0, "replies_avg": 5, 
"authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 28, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17806804312504023996&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "lt0Yf8Wh5O", "title": "Differentiable Robot Rendering", "track": "main", "status": "Poster", "tldr": "", "abstract": "Vision foundation models trained on massive amounts of visual data have shown unprecedented reasoning and planning skills in open-world settings. A key challenge in applying them to robotic tasks is the modality gap between visual data and action data. We introduce differentiable robot rendering, a method allowing the visual appearance of a robot body to be directly differentiable with respect to its control parameters. Our model integrates a kinematics-aware deformable model and Gaussians Splatting and is compatible with any robot form factors and degrees of freedom. We demonstrate its capability and usage in applications including reconstruction of robot poses from images and controlling robots through vision language models. Quantitative and qualitative results show that our differentiable rendering model provides effective gradients for robotic control directly from pixels, setting the foundation for the future applications of vision foundation models in robotics.", "keywords": "Robot Representation;Visual Foundation Model", "primary_area": "", "supplementary_material": "/attachment/f15657e46e2023ed65e4145d4cbed90b5774ae1c.zip", "author": "Ruoshi Liu;Alper Canberk;Shuran Song;Carl Vondrick", "authorids": "~Ruoshi_Liu2;~Alper_Canberk1;~Shuran_Song3;~Carl_Vondrick2", "gender": "M;M;F;M", "homepage": "https://ruoshiliu.github.io/;https://alpercanberk.github.io;https://shurans.github.io/;http://www.cs.columbia.edu/~vondrick/", "dblp": "283/4797;;;26/8610", "google_scholar": "suAawHYAAAAJ;hB1z-gIAAAAJ;https://scholar.google.com/citations?hl=en;3MzhkFIAAAAJ", "orcid": ";;;", "linkedin": "ruoshi-liu-a5046aa0/;;;", "or_profile": "~Ruoshi_Liu2;~Alper_Canberk1;~Shuran_Song3;~Carl_Vondrick2", "aff": "Columbia University;Columbia University;Stanford University;Columbia University", "aff_domain": "columbia.edu;columbia.edu;stanford.edu;columbia.edu", "position": "PhD student;Undergrad student;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nliu2024differentiable,\ntitle={Differentiable Robot Rendering},\nauthor={Ruoshi Liu and Alper Canberk and Shuran Song and Carl Vondrick},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lt0Yf8Wh5O}\n}", "github": "https://github.com/cvlab-columbia/drrobot", "project": "", "reviewers": "FkLZ;VuJm;trpR", "site": "https://openreview.net/forum?id=lt0Yf8Wh5O", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.6666666666666665, "replies_avg": 4, "authors#_avg": 4, "corr_rating_confidence": 0.49999999999999983, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7664753515260284119&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "Columbia University;Stanford University", "aff_unique_dep": ";", 
"aff_unique_url": "https://www.columbia.edu;https://www.stanford.edu", "aff_unique_abbr": "Columbia;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "lyhS75loxe", "title": "A3VLM: Actionable Articulation-Aware Vision Language Model", "track": "main", "status": "Poster", "tldr": "", "abstract": "Vision Language Models (VLMs) for robotics have received significant attention in recent years. As a VLM can understand robot observations and perform complex visual reasoning, it is regarded as a potential universal solution for general robotics challenges such as manipulation and navigation. However, previous robotics VLMs such as RT-1, RT-2, and ManipLLM have focused on directly learning robot actions. Such approaches require collecting a significant amount of robot interaction data, which is extremely costly in the real world. Thus, we propose A3VLM, an object-centric, actionable, articulation-aware vision language model. A3VLM focuses on the articulation structure and action affordances of objects. Its representation is robot-agnostic and can be translated into robot actions using simple action primitives. Extensive experiments in both simulation benchmarks and real-world settings demonstrate the effectiveness and stability of A3VLM.", "keywords": "LLM;VLM;Manipulation;Articulation", "primary_area": "", "supplementary_material": "/attachment/889fcc4e31c99faeee423575a206c7feba170a43.zip", "author": "Siyuan Huang;Haonan Chang;Yuhan Liu;Yimeng Zhu;Hao Dong;Abdeslam Boularias;Peng Gao;Hongsheng Li", "authorids": "~Siyuan_Huang4;~Haonan_Chang1;~Yuhan_Liu2;~Yimeng_Zhu1;~Hao_Dong3;~Abdeslam_Boularias1;~Peng_Gao3;~Hongsheng_Li3", "gender": "M;M;M;M;M;M;M;M", "homepage": "https://siyuanhuang95.github.io/;https://github.com/changhaonan;;https://github.com/yimengZhu/;https://zsdonghao.github.io;http://rl.cs.rutgers.edu/;http://www.ee.cuhk.edu.hk/~hsli;", "dblp": "62/885-4.html;;125/8141;;14/1525-3.html;57/2269;27/7402-1;", "google_scholar": "QNkS4KEAAAAJ;;https://scholar.google.com/citations?hl=en;;xLFL4sMAAAAJ;https://scholar.google.com.tw/citations?user=8AF3RCsAAAAJ;BN2Ze-QAAAAJ;miFIAFMAAAAJ", "orcid": "0009-0005-6363-833X;;;;0000-0003-2261-9122;;;", "linkedin": "siyuan-huang-979672149/;;;;;;;", "or_profile": "~Siyuan_Huang4;~Haonan_Chang1;~Yuhan_Liu2;~Yimeng_Zhu1;~Hao_Dong3;~Abdeslam_Boularias1;~Hongsheng_Li3;~Gao_Peng1", "aff": "Shanghai Jiaotong University;Rutgers, New Brunswick;Rutgers University;Yuandao AI;Peking University;, Rutgers University;The Chinese University of Hong Kong;shanghai ai lab ", "aff_domain": "sjtu.edu.cn;scarletmail.rutgers.edu;rutgers.edu;yuandaoai.com;pku.edu.cn;cs.rutgers.edu;cuhk.edu.hk;pjlab.org.cn", "position": "PhD student;PhD student;PhD student;Researcher;Assistant Professor;Associate Professor;Associate Professor;Researcher", "bibtex": "@inproceedings{\nhuang2024avlm,\ntitle={A3{VLM}: Actionable Articulation-Aware Vision Language Model},\nauthor={Siyuan Huang and Haonan Chang and Yuhan Liu and Yimeng Zhu and Hao Dong and Abdeslam Boularias and Peng Gao and Hongsheng Li},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=lyhS75loxe}\n}", "github": "https://github.com/changhaonan/A3VLM", "project": "", "reviewers": "W221;FBJx;tJSN", "site": "https://openreview.net/forum?id=lyhS75loxe", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, 
"authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 12, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16466965761597409106&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2;3;1;4;5", "aff_unique_norm": "Shanghai Jiao Tong University;Rutgers University;Yuandao AI;Peking University;Chinese University of Hong Kong;Shanghai AI Lab", "aff_unique_dep": ";;;;;", "aff_unique_url": "https://www.sjtu.edu.cn;https://www.rutgers.edu;;http://www.pku.edu.cn;https://www.cuhk.edu.hk;https://www.shanghaiailab.com", "aff_unique_abbr": "SJTU;Rutgers;;Peking U;CUHK;Shanghai AI Lab", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";New Brunswick;Hong Kong SAR", "aff_country_unique_index": "0;1;1;0;0;1;0;0", "aff_country_unique": "China;United States" }, { "id": "ma7McOiCZY", "title": "HYPERmotion: Learning Hybrid Behavior Planning for Autonomous Loco-manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Enabling robots to autonomously perform hybrid motions in diverse environments can be beneficial for long-horizon tasks such as material handling, household chores, and work assistance. This requires extensive exploitation of intrinsic motion capabilities, extraction of affordances from rich environmental information, and planning of physical interaction behaviors. Despite recent progress has demonstrated impressive humanoid whole-body control abilities, they struggle to achieve versatility and adaptability for new tasks. In this work, we propose HYPERmotion, a framework that learns, selects and plans behaviors based on tasks in different scenarios. We combine reinforcement learning with whole-body optimization to generate motion for 38 actuated joints and create a motion library to store the learned skills. We apply the planning and reasoning features of the large language models (LLMs) to complex loco-manipulation tasks, constructing a hierarchical task graph that comprises a series of primitive behaviors to bridge lower-level execution with higher-level planning. By leveraging the interaction of distilled spatial geometry and 2D observation with a visual language model (VLM) to ground knowledge into a robotic morphology selector to choose appropriate actions in single- or dual-arm, legged or wheeled locomotion. Experiments in simulation and real-world show that learned motions can efficiently adapt to new tasks, demonstrating high autonomy from free-text commands in unstructured scenes. 
Videos and website: hy-motion.github.io//", "keywords": "Loco-manipulation;Large Language Models;Humanoid Robot Learning", "primary_area": "", "supplementary_material": "/attachment/ea2905c4b12331fef10f0a56de3eb64e5c8f3d66.zip", "author": "Jin Wang;Rui Dai;Weijie Wang;Luca Rossini;Francesco Ruscelli;Nikos Tsagarakis", "authorids": "~Jin_Wang26;~Rui_Dai7;~Weijie_Wang3;luca.rossini@iit.it;francescoruscelli24@gmail.com;nikos.tsagarakis@iit.it", "gender": "M;M;M;;;", "homepage": ";;;;;", "dblp": ";;;;;", "google_scholar": ";;;;;", "orcid": "0000-0001-7136-2517;;0009-0006-9164-6000;;;", "linkedin": ";rui-dai-b6b012256;;;;", "or_profile": "~Jin_Wang26;~Rui_Dai7;~Weijie_Wang3;luca.rossini@iit.it;francescoruscelli24@gmail.com;nikos.tsagarakis@iit.it", "aff": "Istituto Italiano di Tecnologia;Universit\u00e0 degli Studi di Genova, Istituto Italiano di Tecnologia;Universit\u00e0 degli Studi di Genova, Istituto Italiano di Tecnologia;;;", "aff_domain": "iit.it;iit.it;iit.it;;;", "position": "PhD student;PhD student;PhD student;;;", "bibtex": "@inproceedings{\nwang2024hypermotion,\ntitle={{HYPER}motion: Learning Hybrid Behavior Planning for Autonomous Loco-manipulation},\nauthor={Jin Wang and Rui Dai and Weijie Wang and Luca Rossini and Francesco Ruscelli and Nikos Tsagarakis},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ma7McOiCZY}\n}", "github": "", "project": "", "reviewers": "3ACE;7D92;Ec5Y", "site": "https://openreview.net/forum?id=ma7McOiCZY", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;4", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 6, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14135468702803795788&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1", "aff_unique_norm": "Istituto Italiano di Tecnologia;Universit\u00e0 degli Studi di Genova", "aff_unique_dep": ";", "aff_unique_url": "https://www.iit.it;https://www.unige.it", "aff_unique_abbr": "IIT;UniGe", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Italy" }, { "id": "nQslM6f7dW", "title": "APRICOT: Active Preference Learning and Constraint-Aware Task Planning with LLMs", "track": "main", "status": "Poster", "tldr": "", "abstract": "Home robots performing personalized tasks must adeptly balance user preferences with environmental affordances.\nWe focus on organization tasks within constrained spaces, such as arranging items into a refrigerator, where preferences for placement collide with physical limitations.\nThe robot must infer user preferences based on a small set of demonstrations, which is easier for users to provide than extensively defining all their requirements.\nWhile recent works use Large Language Models (LLMs) to learn preferences from user demonstrations, they encounter two fundamental challenges.\nFirst, there is inherent ambiguity in interpreting user actions, as multiple preferences can often explain a single observed behavior.\nSecond, not all user preferences are practically feasible due to geometric constraints in the environment.\nTo address these challenges, we introduce APRICOT, a novel approach that merges LLM-based Bayesian active preference learning with constraint-aware task planning. 
\nAPRICOT refines its generated preferences by actively querying the user and dynamically adapts its plan to respect environmental constraints.\nWe evaluate APRICOT on a dataset of diverse organization tasks and demonstrate its effectiveness in real-world scenarios, showing significant improvements in both preference satisfaction and plan feasibility.", "keywords": "Active Preference Learning;Task Planning;Large Language Models", "primary_area": "", "supplementary_material": "/attachment/07c61986734c8fe1d2d20a551e810ad902325576.zip", "author": "Huaxiaoyue Wang;Nathaniel Chin;Gonzalo Gonzalez-Pumariega;Xiangwan Sun;Neha Sunkara;Maximus Adrian Pace;Jeannette Bohg;Sanjiban Choudhury", "authorids": "~Huaxiaoyue_Wang1;~Nathaniel_Chin1;~Gonzalo_Gonzalez-Pumariega1;~Xiangwan_Sun1;~Neha_Sunkara1;~Maximus_Adrian_Pace1;~Jeannette_Bohg1;~Sanjiban_Choudhury3", "gender": "F;M;M;Not Specified;F;M;;M", "homepage": "https://lunay0yuki.github.io/;;https://gonzalogonzalezpumariega.com/;;https://ns597.github.io/cv;https://maxpace1.github.io;https://web.stanford.edu/~bohg/;https://www.sanjibanchoudhury.com/", "dblp": "324/6120;;;;;;52/7377;135/8207", "google_scholar": "yweLdycAAAAJ;;72zQVF8AAAAJ;j8c6XJgAAAAJ;;;rjnJnEkAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": "0009-0008-9239-1745;;0009-0004-5425-7319;;;;0000-0002-4921-7193;", "linkedin": "yukiwang-hw;nathaniel-chin-5b2301195/;gonzalogonzalez2000/;sunny-sun25/;neha-sunkara-34269222b/;maximuspace/;;", "or_profile": "~Huaxiaoyue_Wang1;~Nathaniel_Chin1;~Gonzalo_Gonzalez-Pumariega1;~Xiangwan_Sun1;~Neha_Sunkara1;~Maximus_Adrian_Pace1;~Jeannette_Bohg1;~Sanjiban_Choudhury3", "aff": "Cornell University;Cornell University;Cornell University;Cornell University;Cornell University;Cornell University;Stanford University;Cornell University", "aff_domain": "cornell.edu;cornell.edu;cs.cornell.edu;cornell.edu;cornell.edu;cornell.edu;stanford.edu;cornell.edu", "position": "PhD student;MS student;MS student;Undergrad student;Undergrad student;Undergrad student;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nwang2024apricot,\ntitle={{APRICOT}: Active Preference Learning and Constraint-Aware Task Planning with {LLM}s},\nauthor={Huaxiaoyue Wang and Nathaniel Chin and Gonzalo Gonzalez-Pumariega and Xiangwan Sun and Neha Sunkara and Maximus Adrian Pace and Jeannette Bohg and Sanjiban Choudhury},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nQslM6f7dW}\n}", "github": "https://github.com/portal-cornell/apricot", "project": "", "reviewers": "YWXV;SyaR;gKEt", "site": "https://openreview.net/forum?id=nQslM6f7dW", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11281133758283733781&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0;1;0", "aff_unique_norm": "Cornell University;Stanford University", "aff_unique_dep": ";", "aff_unique_url": "https://www.cornell.edu;https://www.stanford.edu", "aff_unique_abbr": "Cornell;Stanford", "aff_campus_unique_index": "1", "aff_campus_unique": ";Stanford", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "nVJm2RdPDu", "title": "DiffuseLoco: Real-Time Legged Locomotion Control with Diffusion from Offline Datasets", "track": "main", "status": "Poster", "tldr": "", "abstract": 
"Offline learning at scale has led to breakthroughs in computer vision, natural language processing, and robotic manipulation domains. However, scaling up learning for legged robot locomotion, especially with multiple skills in a single policy, presents significant challenges for prior online reinforcement learning (RL) methods. To address this challenge, we propose DiffuseLoco, a novel, scalable framework that leverages diffusion models to directly learn from offline multimodal datasets with a diverse set of locomotion skills. With design choices tailored for real-time control in dynamical systems, including receding horizon control and delayed inputs, DiffuseLoco is capable of reproducing multimodality in performing various locomotion skills, zero-shot transferred to real quadruped robots and deployed on edge computes. Through extensive real-world benchmarking, DiffuseLoco exhibits better stability and velocity tracking performance compared to prior RL and non-diffusion-based behavior cloning baselines. This work opens new possibilities for scaling up learning-based legged locomotion control through the scaling of large, expressive models and diverse offline datasets.", "keywords": "Offline Learning;Bipedal Walking;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/a35e4368e350efe4d114ef905241f57474cb7348.zip", "author": "Xiaoyu Huang;Yufeng Chi;Ruofeng Wang;Zhongyu Li;Xue Bin Peng;Sophia Shao;Borivoje Nikolic;Koushil Sreenath", "authorids": "~Xiaoyu_Huang1;~Yufeng_Chi1;~Ruofeng_Wang1;~Zhongyu_Li3;~Xue_Bin_Peng1;~Sophia_Shao1;~Borivoje_Nikolic1;~Koushil_Sreenath1", "gender": "M;M;M;M;M;;M;M", "homepage": "https://whoknowsssss.github.io/al-folio/;;https://www.linkedin.com/in/edison-wang-1097a11ab/;;https://xbpeng.github.io;https://people.eecs.berkeley.edu/~ysshao/index.html;https://www2.eecs.berkeley.edu/Faculty/Homepages/nikolic.html/;", "dblp": "26/1782;;;;;133/9941.html;;", "google_scholar": "G-x_szsAAAAJ;https://scholar.google.com/citations?hl=en;;ouSpgSkAAAAJ;https://scholar.google.ca/citations?user=FwxfQosAAAAJ;;https://scholar.google.com.tw/citations?user=QEqPllIAAAAJ;o9aFV8cAAAAJ", "orcid": ";;;;;;;", "linkedin": ";;;;;;;", "or_profile": "~Xiaoyu_Huang1;~Yufeng_Chi1;~Ruofeng_Wang1;~Zhongyu_Li3;~Xue_Bin_Peng1;~Sophia_Shao1;~Borivoje_Nikolic1;~Koushil_Sreenath1", "aff": "University of California, Berkeley;;University of California, Berkeley;University of California, Berkeley;Simon Fraser University;University of California, Berkeley;University of California, Berkeley;University of California, Berkeley", "aff_domain": "berkeley.edu;;berkeley.edu;berkeley.edu;sfu.ca;berkeley.edu;berkeley.edu;berkeley.edu", "position": "PhD student;;MS student;PhD student;Assistant Professor;Assistant Professor;Full Professor;Assistant Professor", "bibtex": "@inproceedings{\nhuang2024diffuseloco,\ntitle={DiffuseLoco: Real-Time Legged Locomotion Control with Diffusion from Offline Datasets},\nauthor={Xiaoyu Huang and Yufeng Chi and Ruofeng Wang and Zhongyu Li and Xue Bin Peng and Sophia Shao and Borivoje Nikolic and Koushil Sreenath},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nVJm2RdPDu}\n}", "github": "https://github.com/HybridRobotics/DiffuseLoco", "project": "", "reviewers": "TkLp;wZGM;8ty4", "site": "https://openreview.net/forum?id=nVJm2RdPDu", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;3;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 8, 
"corr_rating_confidence": 0.0, "gs_citation": 29, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14937047581246726201&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;1;0;0;0", "aff_unique_norm": "University of California, Berkeley;Simon Fraser University", "aff_unique_dep": ";", "aff_unique_url": "https://www.berkeley.edu;https://www.sfu.ca", "aff_unique_abbr": "UC Berkeley;SFU", "aff_campus_unique_index": "0;0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;0;0;1;0;0;0", "aff_country_unique": "United States;Canada" }, { "id": "nmEt0ci8hi", "title": "General Flow as Foundation Affordance for Scalable Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We address the challenge of acquiring real-world manipulation skills with a scalable framework. We hold the belief that identifying an appropriate prediction target capable of leveraging large-scale datasets is crucial for achieving efficient and universal learning.\nTherefore, we propose to utilize 3D flow, which represents the future trajectories of 3D points on objects of interest, as an ideal prediction target. \n\nTo exploit scalable data resources, we turn our attention to human videos. We develop, for the first time, a language-conditioned 3D flow prediction model directly from large-scale RGBD human video datasets. Our predicted flow offers actionable guidance, thus facilitating zero-shot skill transfer in real-world scenarios.\nWe deploy our method with a policy based on closed-loop flow prediction. Remarkably, without any in-domain finetuning, our method achieves an impressive 81\\% success rate in zero-shot human-to-robot skill transfer, covering 18 tasks in 6 scenes. \n\nOur framework features the following benefits: (1) scalability: leveraging cross-embodiment data resources; (2) wide application: multiple object categories, including rigid, articulated, and soft bodies;\n(3) stable skill transfer: providing actionable guidance with a small inference domain-gap.", "keywords": "Flow;Transferable Affordance;Scalability", "primary_area": "", "supplementary_material": "/attachment/08239f069172d0f3e5e95ce6d86aacaae573cb2f.zip", "author": "Chengbo Yuan;Chuan Wen;Tong Zhang;Yang Gao", "authorids": "~Chengbo_Yuan2;~Chuan_Wen1;~Tong_Zhang23;~Yang_Gao1", "gender": "M;;M;M", "homepage": "https://alvinwen428.github.io/;https://tongzhangthu.github.io/;http://yang-gao.weebly.com;https://michaelyuancb.github.io/", "dblp": "239/8286;;89/4402-29;", "google_scholar": "G5M9nYwAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.com/citations?hl=en;ehrpcBwAAAAJ", "orcid": ";;;", "linkedin": ";;yang-gao-45245348/;", "or_profile": "~Chuan_Wen1;~Tong_Zhang23;~Yang_Gao1;~Michael_Yuan1", "aff": "University of California, Berkeley;Tsinghua University;Tsinghua University;Wuhan University", "aff_domain": "berkeley.edu;tsinghua.edu.cn;tsinghua.edu.cn;whu.edu.cn", "position": "Intern;PhD student;Assistant Professor;Undergrad student", "bibtex": "@inproceedings{\nyuan2024general,\ntitle={General Flow as Foundation Affordance for Scalable Robot Learning},\nauthor={Chengbo Yuan and Chuan Wen and Tong Zhang and Yang Gao},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=nmEt0ci8hi}\n}", "github": "https://github.com/michaelyuancb/general_flow", "project": "", "reviewers": "Y7U2;6L29;7196", "site": "https://openreview.net/forum?id=nmEt0ci8hi", "pdf_size": 0, "rating": "3;3;4", 
"confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 41, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4615048620904467384&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;2", "aff_unique_norm": "University of California, Berkeley;Tsinghua University;Wuhan University", "aff_unique_dep": ";;", "aff_unique_url": "https://www.berkeley.edu;https://www.tsinghua.edu.cn;http://www.whu.edu.cn/", "aff_unique_abbr": "UC Berkeley;THU;WHU", "aff_campus_unique_index": "0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;1;1", "aff_country_unique": "United States;China" }, { "id": "oL1WEZQal8", "title": "OmniH2O: Universal and Dexterous Human-to-Humanoid Whole-Body Teleoperation and Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present OmniH2O (Omni Human-to-Humanoid), a learning-based system for whole-body humanoid teleoperation and autonomy. Using kinematic pose as a universal control interface, OmniH2O enables various ways for a human to control a full-sized humanoid with dexterous hands, including using real-time teleoperation through VR headset, verbal instruction, and RGB camera. OmniH2O also enables full autonomy by learning from teleoperated demonstrations or integrating with frontier models such as GPT-4. OmniH2O demonstrates versatility and dexterity in various real-world whole-body tasks through teleoperation or autonomy, such as playing multiple sports, moving and manipulating objects, and interacting with humans. We develop an RL-based sim-to-real pipeline, which involves large-scale retargeting and augmentation of human motion datasets, learning a real-world deployable policy with sparse sensor input by imitating a privileged teacher policy, and reward designs to enhance robustness and stability. We release the first humanoid whole-body control dataset, OmniH2O-6, containing six everyday tasks, and demonstrate humanoid whole-body skill learning from teleoperated datasets. Videos at the anonymous website [https://anonymous-omni-h2o.github.io/](https://anonymous-omni-h2o.github.io/)", "keywords": "Humanoid Teleoperation;Humanoid Loco-Manipulation;RL", "primary_area": "", "supplementary_material": "/attachment/27d72e18044605ffbf97902f7a2db53f960e8f1f.zip", "author": "Tairan He;Zhengyi Luo;Xialin He;Wenli Xiao;Chong Zhang;Weinan Zhang;Kris M. 
Kitani;Changliu Liu;Guanya Shi", "authorids": "~Tairan_He1;~Zhengyi_Luo1;~Xialin_He1;~Wenli_Xiao1;~Chong_Zhang6;~Weinan_Zhang1;~Kris_M._Kitani1;~Changliu_Liu1;~Guanya_Shi1", "gender": "M;M;M;M;Not Specified;M;M;F;M", "homepage": "https://tairanhe.com;https://zhengyiluo.github.io/;https://xialin-he.github.io/;https://wenlixiao-cs.github.io/;https://zita-ch.github.io/;http://wnzhang.net;http://www.cs.cmu.edu/~kkitani/;http://www.cs.cmu.edu/~cliu6/index.html;http://guanyashi.github.io", "dblp": "263/2891.html;;;;;28/10261-1;42/163;166/3563;230/4386", "google_scholar": "TVWH2U8AAAAJ;lHPTxGsAAAAJ;-oy5DaIAAAAJ;https://scholar.google.com/citations?hl=en;;Qzss0GEAAAAJ;yv3sH74AAAAJ;;joR1Z4UAAAAJ", "orcid": ";;;;;0000-0002-0127-2425;0000-0002-9389-4060;;0000-0002-9075-3705", "linkedin": "tairan-he-41a904294/;zhengyi-zen-luo-726156105/;;wenli-xiao/;;;;;guanya-shi-b07b43126/", "or_profile": "~Tairan_He1;~Zhengyi_Luo1;~Xialin_He1;~Wenli_Xiao1;~Chong_Zhang6;~Weinan_Zhang1;~Kris_M._Kitani1;~Changliu_Liu1;~Guanya_Shi1", "aff": "Carnegie Mellon University;Meta Platforms, Inc.;Shanghai Jiaotong University;Carnegie Mellon University;ETHZ - ETH Zurich;Shanghai Jiaotong University;Carnegie Mellon University;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "andrew.cmu.edu;meta.com;sjtu.edu.cn;cmu.edu;ethz.ch;sjtu.edu.cn;cmu.edu;cmu.edu;andrew.cmu.edu", "position": "PhD student;Intern;Undergrad student;MS student;MS student;Associate Professor;Associate Professor;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nhe2024omniho,\ntitle={OmniH2O: Universal and Dexterous Human-to-Humanoid Whole-Body Teleoperation and Learning},\nauthor={Tairan He and Zhengyi Luo and Xialin He and Wenli Xiao and Chong Zhang and Weinan Zhang and Kris M. Kitani and Changliu Liu and Guanya Shi},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oL1WEZQal8}\n}", "github": "https://github.com/LeCAR-Lab/human2humanoid", "project": "", "reviewers": "huqu;ziut;2Dtf", "site": "https://openreview.net/forum?id=oL1WEZQal8", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 9, "corr_rating_confidence": 0.0, "gs_citation": 69, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6104129611855990822&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;3;2;0;0;0", "aff_unique_norm": "Carnegie Mellon University;Meta;Shanghai Jiao Tong University;ETH Zurich", "aff_unique_dep": ";Meta Platforms, Inc.;;", "aff_unique_url": "https://www.cmu.edu;https://www.meta.com;https://www.sjtu.edu.cn;https://www.ethz.ch", "aff_unique_abbr": "CMU;Meta;SJTU;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;2;1;0;0;0", "aff_country_unique": "United States;China;Switzerland" }, { "id": "oSU7M7MK6B", "title": "Learning Visuotactile Estimation and Control for Non-prehensile Manipulation under Occlusions", "track": "main", "status": "Poster", "tldr": "", "abstract": "Manipulation without grasping, known as non-prehensile manipulation, is essential for dexterous robots in contact-rich environments, but presents many challenges relating with underactuation, hybrid-dynamics, and frictional uncertainty. 
Additionally, object occlusions become a critical problem in scenarios where contact is uncertain and the motion of the object evolves independently of the robot, an issue that previous literature fails to address. We present a method for learning visuotactile state estimators and uncertainty-aware control policies for non-prehensile manipulation under occlusions, by leveraging diverse interaction data from privileged policies trained in simulation. We formulate the estimator within a Bayesian deep learning framework to model its uncertainty, and then train uncertainty-aware control policies by incorporating the pre-learned estimator into the reinforcement learning (RL) loop, both of which lead to significantly improved estimator and policy performance. Therefore, unlike prior non-prehensile research that relies on complex external perception set-ups, our method successfully handles occlusions after sim-to-real transfer to robotic hardware with a simple onboard camera.", "keywords": "State Estimation;Reinforcement Learning with Tactile Sensing;Non-prehensile Manipulation", "primary_area": "", "supplementary_material": "/attachment/3185cc01ca1604415c9bc9caff55d2728c3e2e2e.zip", "author": "Juan Del Aguila Ferrandis;Joao Moura;Sethu Vijayakumar", "authorids": "~Juan_Del_Aguila_Ferrandis1;~Joao_Moura1;sethu.vijayakumar@ed.ac.uk", "gender": "M;M;", "homepage": ";https://sites.google.com/view/joaomoura;", "dblp": ";;", "google_scholar": ";https://scholar.google.co.uk/citations?user=1L5kTRcAAAAJ;", "orcid": ";;", "linkedin": "juan-del-aguila-ferrandis/;joaopousamoura/;", "or_profile": "~Juan_Del_Aguila_Ferrandis1;~Joao_Moura1;sethu.vijayakumar@ed.ac.uk", "aff": ";University of Edinburgh, University of Edinburgh;", "aff_domain": ";ed.ac.uk;", "position": ";Postdoc;", "bibtex": "@inproceedings{\nferrandis2024learning,\ntitle={Learning Visuotactile Estimation and Control for Non-prehensile Manipulation under Occlusions},\nauthor={Juan Del Aguila Ferrandis and Joao Moura and Sethu Vijayakumar},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=oSU7M7MK6B}\n}", "github": "", "project": "", "reviewers": "eXyy;XGFC;TfpZ", "site": "https://openreview.net/forum?id=oSU7M7MK6B", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3332818822214314318&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0", "aff_unique_norm": "University of Edinburgh", "aff_unique_dep": "", "aff_unique_url": "https://www.ed.ac.uk", "aff_unique_abbr": "Edinburgh", "aff_country_unique_index": "0", "aff_country_unique": "United Kingdom" }, { "id": "ovjxugn9Q2", "title": "SoftManiSim: A Fast Simulation Framework for Multi-Segment Continuum Manipulators Tailored for Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper introduces SoftManiSim, a novel simulation framework for multi-segment continuum manipulators. Existing continuum robot simulators often rely on simplifying assumptions, such as constant curvature bending or ignoring contact forces, to meet real-time simulation and training demands. To bridge this gap, we propose a robust and rapid mathematical model for continuum robots at the core of SoftManiSim, ensuring precise and adaptable simulations. 
The framework can integrate with various rigid-body robots, increasing its utility across different robotic platforms. SoftManiSim supports parallel operations for simultaneous simulations of multiple robots and generates synthetic data essential for training deep reinforcement learning models. This capability enhances the development and optimization of control strategies in dynamic environments. Extensive simulations validate the framework's effectiveness, demonstrating its capabilities in handling complex robotic interactions and tasks. We also present real robot validation to showcase the simulator's practical applicability and accuracy in real-world settings. To our knowledge, SoftManiSim is the first open-source real-time simulator capable of modeling continuum robot behavior under dynamic point/distributed loading. It enables rapid deployment in reinforcement learning and machine learning applications. \nThis simulation framework can be downloaded from https://github.com/MohammadKasaei/SoftManiSim.", "keywords": "Simulation Framework;Soft Robotics;Mathematical Modelling;Robot Learning", "primary_area": "", "supplementary_material": "/attachment/ba445556f94a82d4fb9f2bc59b374f6423a5ae2d.zip", "author": "Mohammadreza Kasaei;Hamidreza Kasaei;Mohsen Khadem", "authorids": "~Mohammadreza_Kasaei1;~Hamidreza_Kasaei1;~Mohsen_Khadem1", "gender": "M;M;", "homepage": "https://mohammadkasaei.github.io/Mohammadreza-Kasaei/;https://www.ai.rug.nl/hkasaei;https://homepages.inf.ed.ac.uk/skhadem/", "dblp": ";;", "google_scholar": "2aY06V4AAAAJ;VFr_XuYAAAAJ;https://scholar.google.co.uk/citations?user=EdlB5Q8AAAAJ", "orcid": ";;", "linkedin": ";hamidreza-kasaei-49b83b57/;", "or_profile": "~Mohammadreza_Kasaei1;~Hamidreza_Kasaei1;~Mohsen_Khadem1", "aff": ";University of Groningen;Edinburgh University, University of Edinburgh", "aff_domain": ";rug.nl;inf.ed.ac.uk", "position": ";Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nkasaei2024softmanisim,\ntitle={SoftManiSim: A Fast Simulation Framework for Multi-Segment Continuum Manipulators Tailored for Robot Learning},\nauthor={Mohammadreza Kasaei and Hamidreza Kasaei and Mohsen Khadem},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ovjxugn9Q2}\n}", "github": "https://github.com/MohammadKasaei/SoftManiSim", "project": "", "reviewers": "osAG;Entu;Vw2X", "site": "https://openreview.net/forum?id=ovjxugn9Q2", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17805217993793708257&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1", "aff_unique_norm": "University of Groningen;University of Edinburgh", "aff_unique_dep": ";", "aff_unique_url": "https://www.rug.nl;https://www.ed.ac.uk", "aff_unique_abbr": "RUG;Edinburgh", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1", "aff_country_unique": "Netherlands;United Kingdom" }, { "id": "p6Wq6TjjHH", "title": "Generative Factor Chaining: Coordinated Manipulation with Diffusion-based Factor Graph", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning to plan for multi-step, multi-manipulator tasks is notoriously difficult because of the large search space and the complex constraint satisfaction problems. 
We present Generative Factor Chaining (GFC), a composable generative model for planning. GFC represents a planning problem as a spatial-temporal factor graph, where nodes represent objects and robots in the scene, spatial factors capture the distributions of valid relationships among nodes, and temporal factors represent the distributions of skill transitions. Each factor is implemented as a modular diffusion model, which are composed during inference to generate feasible long-horizon plans through bi-directional message passing. We show that GFC can solve complex bimanual manipulation tasks and exhibits strong generalization to unseen planning tasks with novel combinations of objects and constraints. More details can be found at: https://sites.google.com/view/generative-factor-chaining", "keywords": "Task and Motion Planning;Manipulation Planning;Bimanual Manipulation;Generative Models", "primary_area": "", "supplementary_material": "/attachment/c825f127d3ce16cf9a8207c98da7df458c942ac1.zip", "author": "Utkarsh Aashu Mishra;Yongxin Chen;Danfei Xu", "authorids": "~Utkarsh_Aashu_Mishra2;~Yongxin_Chen1;~Danfei_Xu1", "gender": "M;M;M", "homepage": "http://utkarshmishra04.github.io/;https://yongxin.ae.gatech.edu/;https://cs.stanford.edu/~danfei/", "dblp": "274/2706;;135/8443", "google_scholar": "10HbT44AAAAJ;X8BYiV4AAAAJ;J5D4kcoAAAAJ", "orcid": "0000-0002-4977-5187;;", "linkedin": "utkarshamishra/;;", "or_profile": "~Utkarsh_Aashu_Mishra2;~Yongxin_Chen1;~Danfei_Xu1", "aff": "Toyota Research Institute;Georgia Institute of Technology;NVIDIA", "aff_domain": "tri.global;gatech.edu;nvidia.com", "position": "Intern;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nmishra2024generative,\ntitle={Generative Factor Chaining: Coordinated Manipulation with Diffusion-based Factor Graph},\nauthor={Utkarsh Aashu Mishra and Yongxin Chen and Danfei Xu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=p6Wq6TjjHH}\n}", "github": "", "project": "", "reviewers": "tvkY;FWri;dtQm", "site": "https://openreview.net/forum?id=p6Wq6TjjHH", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;5;5", "rating_avg": 3.3333333333333335, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.4999999999999999, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6459113773483343264&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2", "aff_unique_norm": "Toyota Research Institute;Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";;NVIDIA Corporation", "aff_unique_url": "https://www.tri.global;https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "TRI;Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "pPhTsonbXq", "title": "GraspSplats: Efficient Manipulation with 3D Feature Splatting", "track": "main", "status": "Poster", "tldr": "", "abstract": "The ability for robots to perform efficient and zero-shot grasping of object parts is crucial for practical applications and is becoming prevalent with recent advances in Vision-Language Models (VLMs). To bridge the 2D-to-3D gap for representations to support such a capability, existing methods rely on neural fields (NeRFs) via differentiable rendering or point-based projection methods. 
However, we demonstrate that NeRFs are inappropriate for scene changes due to their implicitness, and that point-based methods are inaccurate for part localization without rendering-based optimization. To address these issues, we propose GraspSplats. Using depth supervision and a novel reference feature computation method, GraspSplats can generate high-quality scene representations in under 60 seconds. We further validate the advantages of Gaussian-based representation by showing that the explicit and optimized geometry in GraspSplats is sufficient to natively support (1) real-time grasp sampling and (2) dynamic and articulated object manipulation with point trackers.\nWith extensive experiments on a Franka robot, we demonstrate that GraspSplats significantly outperforms existing methods under diverse task settings. In particular, GraspSplats outperforms NeRF-based methods like F3RM and LERF-TOGO, and 2D detection methods. The code will be released.", "keywords": "Zero-shot manipulation;Gaussian Splatting;Keypoint Tracking", "primary_area": "", "supplementary_material": "/attachment/f9f249cb9a5bf09fd470047cee738f0561fc3e77.zip", "author": "Mazeyu Ji;Ri-Zhao Qiu;Xueyan Zou;Xiaolong Wang", "authorids": "~Mazeyu_Ji1;~Ri-Zhao_Qiu1;~Xueyan_Zou1;~Xiaolong_Wang3", "gender": "M;Not Specified;F;M", "homepage": "https://jimazeyu.github.io/;https://rogerqi.github.io/;https://maureenzou.github.io/;https://xiaolonw.github.io/", "dblp": ";336/5470;273/3780;91/952-4", "google_scholar": "j1zLfKwAAAAJ;uH0re54AAAAJ;eslbQqoAAAAJ;Y8O9N_0AAAAJ", "orcid": ";;;", "linkedin": ";rizhaoqiu/;;", "or_profile": "~Mazeyu_Ji1;~Ri-Zhao_Qiu1;~Xueyan_Zou1;~Xiaolong_Wang3", "aff": "University of California, San Diego;University of California, San Diego;University of Wisconsin - Madison;University of California, San Diego", "aff_domain": "ucsd.edu;ucsd.edu;wisc.edu;ucsd.edu", "position": "MS student;PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nji2024graspsplats,\ntitle={GraspSplats: Efficient Manipulation with 3D Feature Splatting},\nauthor={Mazeyu Ji and Ri-Zhao Qiu and Xueyan Zou and Xiaolong Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pPhTsonbXq}\n}", "github": "https://github.com/jimazeyu/GraspSplats", "project": "", "reviewers": "s84c;WC1J;vXka;jdjs", "site": "https://openreview.net/forum?id=pPhTsonbXq", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "3;3;5;4", "rating_avg": 2.75, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 4, "corr_rating_confidence": 0.5222329678670935, "gs_citation": 16, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9749043167414104742&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0", "aff_unique_norm": "University of California, San Diego;University of Wisconsin-Madison", "aff_unique_dep": ";", "aff_unique_url": "https://www.ucsd.edu;https://www.wisc.edu", "aff_unique_abbr": "UCSD;UW-Madison", "aff_campus_unique_index": "0;0;1;0", "aff_campus_unique": "San Diego;Madison", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "pcPSGZFaCH", "title": "Modeling the Real World with High-Density Visual Particle Dynamics", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present High-Density Visual Particle Dynamics (HD-VPD), a learned world model that can emulate the physical dynamics of real scenes by processing massive latent point clouds containing 100K+ particles. 
To enable efficiency at this scale, we introduce a novel family of Point Cloud Transformers (PCTs) called Interlacers, leveraging intertwined linear-attention Performer layers and graph-based neighbour attention layers. We demonstrate the capabilities of HD-VPD by modeling the dynamics of high degree-of-freedom bi-manual robots with two RGB-D cameras. Compared to the previous graph neural network approach, our Interlacer dynamics is twice as fast with the same prediction quality, and can achieve higher quality using 4x as many particles. We illustrate how HD-VPD can evaluate motion plan quality with robotic box-pushing and can-grasping tasks. See videos and particle dynamics rendered by HD-VPD at https://sites.google.com/view/hd-vpd.", "keywords": "point clouds;particle dynamics;world models for control;Performers", "primary_area": "", "supplementary_material": "/attachment/cbd1b85382eed1ace5a0ddea5e3c71527d757b69.zip", "author": "William F Whitney;Jake Varley;Deepali Jain;Krzysztof Marcin Choromanski;Sumeet Singh;Vikas Sindhwani", "authorids": "~William_F_Whitney1;~Jake_Varley1;~Deepali_Jain1;~Krzysztof_Marcin_Choromanski1;~Sumeet_Singh3;~Vikas_Sindhwani1", "gender": ";M;F;;M;M", "homepage": "http://willwhitney.com;http://www.cs.columbia.edu/~jvarley/;;;;http://vikas.sindhwani.org", "dblp": "160/8671;;84/8010;78/11411;;26/4825", "google_scholar": "aQcYWDMAAAAJ;UJcm1MoAAAAJ;;;ZGpE5cYAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": ";;;;;vikassindhwani", "or_profile": "~William_F_Whitney1;~Jake_Varley1;~Deepali_Jain1;~Krzysztof_Marcin_Choromanski1;~Sumeet_Singh3;~Vikas_Sindhwani1", "aff": "Google DeepMind;Google;Google;Google Brain Robotics & Columbia University;Google Brain Robotics;Google", "aff_domain": "deepmind.com;google.com;google.com;columbia.edu;google.com;google.com", "position": "Researcher;Engineer;Researcher;research scientist & adjunct assistant professor;Researcher;Senior Staff Research Scientist", "bibtex": "@inproceedings{\nwhitney2024modeling,\ntitle={Modeling the Real World with High-Density Visual Particle Dynamics},\nauthor={William F Whitney and Jake Varley and Deepali Jain and Krzysztof Marcin Choromanski and Sumeet Singh and Vikas Sindhwani},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=pcPSGZFaCH}\n}", "github": "", "project": "", "reviewers": "F4kU;L6Jk;FVjU", "site": "https://openreview.net/forum?id=pcPSGZFaCH", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;5;3", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=202312655863397357&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Google", "aff_unique_dep": "Google DeepMind", "aff_unique_url": "https://deepmind.com", "aff_unique_abbr": "DeepMind", "aff_campus_unique_index": "1;1;1;1;1", "aff_campus_unique": ";Mountain View", "aff_country_unique_index": "0;1;1;1;1;1", "aff_country_unique": "United Kingdom;United States" }, { "id": "qUSa3F79am", "title": "Policy Adaptation via Language Optimization: Decomposing Tasks for Few-Shot Imitation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learned language-conditioned robot policies often struggle to effectively adapt to new real-world tasks even when pre-trained across a diverse set of instructions. 
We propose a novel approach for few-shot adaptation to unseen tasks that exploits the semantic understanding of task decomposition provided by vision-language models (VLMs). Our method, Policy Adaptation via Language Optimization\u00a0(PALO), combines a handful of demonstrations of a task with proposed language decompositions sampled from a VLM to enable rapid nonparametric adaptation, avoiding the need for a larger fine-tuning dataset. We evaluate PALO on extensive real-world experiments consisting of challenging unseen, long-horizon robot manipulation tasks. We find that PALO is able to consistently complete long-horizon, multi-tier tasks in the real world, outperforming state-of-the-art pre-trained generalist policies and methods that have access to the same demonstrations.", "keywords": "Reinforcement Learning;Vision-Language Models;Manipulation", "primary_area": "", "supplementary_material": "/attachment/e6707b1952488ab2893d76f3ff3bd0ff8565310c.zip", "author": "Vivek Myers;Chunyuan Zheng;Oier Mees;Kuan Fang;Sergey Levine", "authorids": "~Vivek_Myers1;~Chunyuan_Zheng2;~Oier_Mees1;~Kuan_Fang3;~Sergey_Levine1", "gender": ";M;M;;M", "homepage": "https://people.eecs.berkeley.edu/~vmyers/;https://www.wj2003b.github.io;https://www.oiermees.com/;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "270/8694;;190/8659;;80/7594", "google_scholar": "5NGAbT4AAAAJ;N0tYevAAAAAJ;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;;8R35rCwAAAAJ", "orcid": ";;;;", "linkedin": ";bill-zheng-5207991ab/;oier-mees-a3069488;;", "or_profile": "~Vivek_Myers1;~Chunyuan_Zheng2;~Oier_Mees1;~Kuan_Fang3;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;;Google", "aff_domain": "berkeley.edu;berkeley.edu;eecs.berkeley.edu;;google.com", "position": "PhD student;Undergrad student;Postdoc;;Research Scientist", "bibtex": "@inproceedings{\nmyers2024policy,\ntitle={Policy Adaptation via Language Optimization: Decomposing Tasks for Few-Shot Imitation},\nauthor={Vivek Myers and Chunyuan Zheng and Oier Mees and Kuan Fang and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qUSa3F79am}\n}", "github": "https://github.com/vivekmyers/palo-robot", "project": "", "reviewers": "MWui;AzAi;15gE", "site": "https://openreview.net/forum?id=qUSa3F79am", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 13, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=673715574513958205&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "qoebyrnF36", "title": "Control with Patterns: A D-learning Method", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning-based control policies are widely used in various tasks in the field of robotics and control. 
However, formal (Lyapunov) stability guarantees for learning-based controllers with nonlinear dynamical systems are challenging to obtain.\n\tWe propose a novel control approach, namely Control with Patterns (CWP), to address the stability issue over data sets corresponding to nonlinear dynamical systems.\n\tFor data sets of this kind, we introduce a new definition, namely exponential attraction on data sets, to describe nonlinear dynamical systems under consideration. The problem of exponential attraction on data sets is converted to a pattern classification one based on the data sets and parameterized Lyapunov functions. Furthermore, D-learning is proposed as a method for performing CWP without knowledge of the system dynamics. \n\tFinally, the effectiveness of CWP based on D-learning is demonstrated through simulations and real flight experiments. In these experiments, the position of the multicopter is stabilized using only real-time images as feedback, which can be considered as an Image-Based Visual Servoing (IBVS) problem.", "keywords": "Lyapunov Methods;Reinforcement Learning;Control with Patterns;D-learning;Visual Servoing", "primary_area": "", "supplementary_material": "/attachment/95833e0ac95135b5fe29a8a2f9e5e2e085ad7ed9.zip", "author": "Quan Quan;Kai-Yuan Cai;Chenyu Wang", "authorids": "~Quan_Quan1;~Kai-Yuan_Cai1;~Chenyu_Wang10", "gender": "M;M;M", "homepage": "https://rfly.buaa.edu.cn;;https://github.com/AJingshou", "dblp": ";71/5252.html;", "google_scholar": ";;", "orcid": ";;", "linkedin": ";;", "or_profile": "~Quan_Quan1;~Kai-Yuan_Cai1;~Chenyu_Wang10", "aff": "Beihang University;Beihang University;Beihang University", "aff_domain": "buaa.edu.cn;buaa.edu.cn;buaa.edu.cn", "position": "Full Professor;Full Professor;PhD student", "bibtex": "@inproceedings{\nquan2024control,\ntitle={Control with Patterns: A D-learning Method},\nauthor={Quan Quan and Kai-Yuan Cai and Chenyu Wang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=qoebyrnF36}\n}", "github": "", "project": "", "reviewers": "6cXy;5T6L;FHr9", "site": "https://openreview.net/forum?id=qoebyrnF36", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;2;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:Ed69pKJBrwoJ:scholar.google.com/&scioq=Control+with+Patterns:+A+D-learning+Method&hl=en&as_sdt=0,5", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "Beihang University", "aff_unique_dep": "", "aff_unique_url": "http://www.buaa.edu.cn/", "aff_unique_abbr": "BUAA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "China" }, { "id": "r6ZhiVYriY", "title": "Trust the PRoC3S: Solving Long-Horizon Robotics Problems with LLMs and Constraint Satisfaction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent developments in pretrained large language models (LLMs) applied to robotics have demonstrated their capacity for sequencing a set of discrete skills to achieve open-ended goals in simple robotic tasks. In this paper, we examine the topic of LLM planning for a set of *continuously parameterized* skills whose execution must avoid violations of a set of kinematic, geometric, and physical constraints. 
We prompt the LLM to output code for a function with open parameters, which, together with environmental constraints, can be viewed as a Continuous Constraint Satisfaction Problem (CCSP). This CCSP can be solved through sampling or optimization to find a skill sequence and continuous parameter settings that achieve the goal while avoiding constraint violations. Additionally, we consider cases where the LLM proposes unsatisfiable CCSPs, such as those that are kinematically infeasible, dynamically unstable, or lead to collisions, and re-prompt the LLM to form a new CCSP accordingly. Experiments across simulated and real-world domains demonstrate that our proposed strategy, \\OursNoSpace, is capable of solving a wide range of complex manipulation tasks with realistic constraints much more efficiently and effectively than existing baselines.", "keywords": "LLMs for planning;task and motion planning;constraint satisfaction", "primary_area": "", "supplementary_material": "/attachment/89f842d5fef3d4807da35c7552ec41cc702dafeb.zip", "author": "Aidan Curtis;Nishanth Kumar;Jing Cao;Tom\u00e1s Lozano-P\u00e9rez;Leslie Pack Kaelbling", "authorids": "~Aidan_Curtis2;~Nishanth_Kumar1;~Jing_Cao3;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "gender": "M;M;F;M;F", "homepage": ";http://nishanthjkumar.com/;;http://people.csail.mit.edu/tlp/;http://people.csail.mit.edu/lpk/", "dblp": ";211/7595;;90/752;k/LesliePackKaelbling", "google_scholar": "tRJf4Q8AAAAJ;FE512o4AAAAJ;;gQOKAggAAAAJ;IcasIiwAAAAJ", "orcid": ";0000-0001-9291-3728;;;0000-0001-6054-7145", "linkedin": ";nishanth-kumar;jingcao26/;;", "or_profile": "~Aidan_Curtis2;~Nishanth_Kumar1;~Jing_Cao3;~Tom\u00e1s_Lozano-P\u00e9rez1;~Leslie_Pack_Kaelbling1", "aff": "Massachusetts Institute of Technology;The AI Institute;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "mit.edu;theaiinstitute.com;mit.edu;mit.edu;mit.edu", "position": "PhD student;Intern;Undergrad student;Full Professor;Full Professor", "bibtex": "@inproceedings{\ncurtis2024trust,\ntitle={Trust the {PR}oC3S: Solving Long-Horizon Robotics Problems with {LLM}s and Constraint Satisfaction},\nauthor={Aidan Curtis and Nishanth Kumar and Jing Cao and Tom{\\'a}s Lozano-P{\\'e}rez and Leslie Pack Kaelbling},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=r6ZhiVYriY}\n}", "github": "https://github.com/Learning-and-Intelligent-Systems/proc3s", "project": "", "reviewers": "XQei;7svD;tB3j", "site": "https://openreview.net/forum?id=r6ZhiVYriY", "pdf_size": 0, "rating": "2;2;3", "confidence": "4;4;4", "rating_avg": 2.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=4389115908196599465&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;0;0;0", "aff_unique_norm": "Massachusetts Institute of Technology;AI Institute", "aff_unique_dep": ";", "aff_unique_url": "https://web.mit.edu;", "aff_unique_abbr": "MIT;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "rEteJcq61j", "title": "Toward General Object-level Mapping from Sparse Views with 3D Diffusion Priors", "track": "main", "status": "Poster", "tldr": "", "abstract": "Object-level mapping builds a 3D map of objects in a scene with detailed shapes and 
poses from multi-view sensor observations. Conventional methods struggle to build complete shapes and estimate accurate poses due to partial occlusions and sensor noise. They require dense observations to cover all objects, which is challenging to achieve in robotics trajectories. Recent work introduces generative shape priors for object-level mapping from sparse views, but is limited to single-category objects. In this work, we propose a General Object-level Mapping system, GOM, which leverages a 3D diffusion model as shape prior with multi-category support and outputs Neural Radiance Fields (NeRFs) for both texture and geometry for all objects in a scene. \nGOM includes an effective formulation to guide a pre-trained diffusion model with extra nonlinear constraints from sensor measurements without finetuning. We also develop a probabilistic optimization formulation to fuse multi-view sensor observations and diffusion priors for joint 3D object pose and shape estimation. \nOur GOM system demonstrates superior multi-category mapping performance from sparse views, and achieves more accurate mapping results compared to state-of-the-art methods on the real-world benchmarks. \nWe will release our code and model upon publication.", "keywords": "Mapping;Objects Reconstruction;Pose Estimation;Diffusion", "primary_area": "", "supplementary_material": "/attachment/627510437d3c805795cfc92aed2fd72b5e89364f.zip", "author": "Ziwei Liao;Binbin Xu;Steven L. Waslander", "authorids": "~Ziwei_Liao1;~Binbin_Xu1;~Steven_L._Waslander1", "gender": "M;;M", "homepage": ";https://binbin-xu.github.io/;https://trailab.utias.utoronto.ca", "dblp": "250/5212;20/3602-1;18/7142", "google_scholar": "IhfB2iQAAAAJ;https://scholar.google.co.uk/citations?user=874PofoAAAAJ;jY_Bcd8AAAAJ", "orcid": ";;0000-0003-4217-4415", "linkedin": "zwliao/;;", "or_profile": "~Ziwei_Liao1;~Binbin_Xu1;~Steven_Lake_Waslander1", "aff": "University of Toronto;University of Toronto;University of Toronto", "aff_domain": "utoronto.ca;utoronto.ca;utoronto.ca", "position": "PhD student;Postdoc;Full Professor", "bibtex": "@inproceedings{\nliao2024toward,\ntitle={Toward General Object-level Mapping from Sparse Views with 3D Diffusion Priors},\nauthor={Ziwei Liao and Binbin Xu and Steven L. Waslander},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rEteJcq61j}\n}", "github": "https://github.com/TRAILab/GeneralObjectMapping", "project": "", "reviewers": "vp6P;UoFm;3JtG", "site": "https://openreview.net/forum?id=rEteJcq61j", "pdf_size": 0, "rating": "3;4;4", "confidence": "2;3;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10126988477886001542&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Toronto", "aff_unique_dep": "", "aff_unique_url": "https://www.utoronto.ca", "aff_unique_abbr": "U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Canada" }, { "id": "rRpmVq6yHv", "title": "SELFI: Autonomous Self-Improvement with RL for Vision-Based Navigation around People", "track": "main", "status": "Poster", "tldr": "", "abstract": "Autonomous self-improving robots that interact and improve with experience are key to the real-world deployment of robotic systems. 
In this paper, we propose an online learning method, SELFI, that leverages online robot experience to rapidly fine-tune pre-trained control policies efficiently. SELFI applies online model-free reinforcement learning on top of offline model-based learning to bring out the best parts of both learning paradigms. Specifically, SELFI stabilizes the online learning process by incorporating the same model-based learning objective from offline pre-training into the Q-values learned with online model-free reinforcement learning. We evaluate SELFI in multiple real-world environments and report improvements in terms of collision avoidance, as well as more socially compliant behavior, measured by a human user study. SELFI enables us to quickly learn useful robotic behaviors with less human interventions such as pre-emptive behavior for the pedestrians, collision avoidance for small and transparent objects, and avoiding travel on uneven floor surfaces. We provide supplementary videos to demonstrate the performance of our fine-tuned policy.", "keywords": "online reinforcement learning;vision-based navigation", "primary_area": "", "supplementary_material": "/attachment/a0c83fc92a77565ad579025bafff96cc88c6d7a2.zip", "author": "Noriaki Hirose;Dhruv Shah;Kyle Stachowicz;Ajay Sridhar;Sergey Levine", "authorids": "~Noriaki_Hirose1;~Dhruv_Shah1;~Kyle_Stachowicz1;~Ajay_Sridhar1;~Sergey_Levine1", "gender": "M;M;M;M;M", "homepage": ";http://cs.berkeley.edu/~shah;https://kylesta.ch;https://ajaysridhar.com;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "126/5605;;;;80/7594", "google_scholar": "https://scholar.google.co.jp/citations?user=xvOlfw8AAAAJ;;;https://scholar.google.com/citations?hl=en;8R35rCwAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Noriaki_Hirose1;~Dhruv_Shah1;~Kyle_Stachowicz1;~Ajay_Sridhar1;~Sergey_Levine1", "aff": "Toyota Central R&D Labs., Inc;UC Berkeley;University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "mosk.tytlabs.co.jp;berkeley.edu;berkeley.edu;berkeley.edu;google.com", "position": "Researcher;PhD student;PhD student;Undergrad student;Research Scientist", "bibtex": "@inproceedings{\nhirose2024selfi,\ntitle={{SELFI}: Autonomous Self-Improvement with {RL} for Vision-Based Navigation around People},\nauthor={Noriaki Hirose and Dhruv Shah and Kyle Stachowicz and Ajay Sridhar and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rRpmVq6yHv}\n}", "github": "", "project": "", "reviewers": "1Aui;5y57;agf9;XMR4", "site": "https://openreview.net/forum?id=rRpmVq6yHv", "pdf_size": 0, "rating": "3;3;4;4", "confidence": "5;4;5;5", "rating_avg": 3.5, "confidence_avg": 4.75, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.5773502691896257, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=6689785585648890304&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Toyota Central R&D Labs., Inc;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.toyota-global.com;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Toyota R&D;UC Berkeley;Google", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Japan;United States" }, { "id": "rThtgkXuvZ", "title": "NOD-TAMP: Generalizable Long-Horizon Planning with Neural 
Object Descriptors", "track": "main", "status": "Poster", "tldr": "", "abstract": "Solving complex manipulation tasks in household and factory settings remains challenging due to long-horizon reasoning, fine-grained interactions, and broad object and scene diversity. Learning skills from demonstrations can be an effective strategy, but such methods often have limited generalizability beyond training data and struggle to solve long-horizon tasks. To overcome this, we propose to synergistically combine two paradigms: Neural Object Descriptors (NODs) that produce generalizable object-centric features and Task and Motion Planning (TAMP) frameworks that chain short-horizon skills to solve multi-step tasks. We introduce NOD-TAMP, a TAMP-based framework that extracts short manipulation trajectories from a handful of human demonstrations, adapts these trajectories using NOD features, and composes them to solve broad long-horizon, contact-rich tasks. NOD-TAMP solves existing manipulation benchmarks with a handful of demonstrations and significantly outperforms prior NOD-based approaches on new tabletop manipulation tasks that require diverse generalization. Finally, we deploy NOD-TAMP on a number of real-world tasks, including tool-use and high-precision insertion. For more details, please visit https://nodtamp.github.io/.", "keywords": "Robot Learning;Robot Planning;Manipulation", "primary_area": "", "supplementary_material": "/attachment/901cdbc3b253f90ebbb02b9ca89523b6f9610146.zip", "author": "Shuo Cheng;Caelan Reed Garrett;Ajay Mandlekar;Danfei Xu", "authorids": "~Shuo_Cheng1;~Caelan_Reed_Garrett1;~Ajay_Mandlekar1;~Danfei_Xu1", "gender": "M;M;M;M", "homepage": "https://sites.google.com/view/shuocheng/home;http://web.mit.edu/caelan/www/;https://ai.stanford.edu/~amandlek/;https://cs.stanford.edu/~danfei/", "dblp": "179/0863;161/9727;https://dblp.uni-trier.de/pers/hd/m/Mandlekar:Ajay;135/8443", "google_scholar": "5CL_0qMAAAAJ;KVUCqGwAAAAJ;MEz23joAAAAJ;J5D4kcoAAAAJ", "orcid": ";0000-0002-6474-1276;;", "linkedin": ";caelan-garrett-85197977/;;", "or_profile": "~Shuo_Cheng1;~Caelan_Reed_Garrett1;~Ajay_Mandlekar1;~Danfei_Xu1", "aff": "Georgia Institute of Technology;NVIDIA;NVIDIA;NVIDIA", "aff_domain": "gatech.edu;nvidia.com;nvidia.com;nvidia.com", "position": "PhD student;Researcher;Researcher;Research Scientist", "bibtex": "@inproceedings{\ncheng2024nodtamp,\ntitle={{NOD}-{TAMP}: Generalizable Long-Horizon Planning with Neural Object Descriptors},\nauthor={Shuo Cheng and Caelan Reed Garrett and Ajay Mandlekar and Danfei Xu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rThtgkXuvZ}\n}", "github": "", "project": "", "reviewers": "3zeS;Stau;s2BX", "site": "https://openreview.net/forum?id=rThtgkXuvZ", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999997, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:38KPC9TLeSwJ:scholar.google.com/&scioq=NOD-TAMP:+Generalizable+Long-Horizon+Planning+with+Neural+Object+Descriptors&hl=en&as_sdt=0,33", "gs_version_total": 3, "aff_unique_index": "0;1;1;1", "aff_unique_norm": "Georgia Institute of Technology;NVIDIA", "aff_unique_dep": ";NVIDIA Corporation", "aff_unique_url": "https://www.gatech.edu;https://www.nvidia.com", "aff_unique_abbr": "Georgia Tech;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", 
"aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "rY5T2aIjPZ", "title": "DeliGrasp: Inferring Object Properties with LLMs for Adaptive Grasp Policies", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large language models (LLMs) can provide rich physical descriptions of most worldly objects, allowing robots to achieve more informed and capable grasping. We leverage LLMs' common sense physical reasoning and code-writing abilities to infer an object's physical characteristics-mass $m$, friction coefficient $\\mu$, and spring constant $k$-from a semantic description, and then translate those characteristics into an executable adaptive grasp policy. Using a two-finger gripper with a built-in depth camera that can control its torque by limiting motor current, we demonstrate that LLM-parameterized but first-principles grasp policies outperform both traditional adaptive grasp policies and direct LLM-as-code policies on a custom benchmark of 12 delicate and deformable items including food, produce, toys, and other everyday items, spanning two orders of magnitude in mass and required pick-up force. We then improve property estimation and grasp performance on variable size objects with model finetuning on property-based comparisons and eliciting such comparisons via chain-of-thought prompting. We also demonstrate how compliance feedback from DeliGrasp policies can aid in downstream tasks such as measuring produce ripeness. Our code and videos are available at: https://deligrasp.github.io", "keywords": "contact-rich manipulation;adaptive grasping;force control;produce manipulation", "primary_area": "", "supplementary_material": "/attachment/c29195e3f38a478e8783685621d1b04b23b97c8f.zip", "author": "William Xie;Maria Valentini;Jensen Lavering;Nikolaus Correll", "authorids": "~William_Xie1;~Maria_Valentini1;~Jensen_Lavering1;~Nikolaus_Correll1", "gender": ";F;;", "homepage": ";https://www.colorado.edu/ics/maria-valentini;;http://correll.cs.colorado.edu", "dblp": ";;;", "google_scholar": ";JZL5_aIAAAAJ;;", "orcid": ";;;", "linkedin": ";maria-valentini-057756139;jensen-lavering-50084a224?utm_source=share&utm_campaign=share_via&utm_content=profile&utm_medium=ios_app;", "or_profile": "~William_Xie1;~Maria_Valentini1;~Jensen_Lavering1;~Nikolaus_Correll1", "aff": ";University of Colorado at Boulder;University of Colorado at Boulder;University of Colorado at Boulder", "aff_domain": ";colorado.edu;colorado.edu;colorado.edu", "position": ";PhD student;Undergrad student;Associate Professor", "bibtex": "@inproceedings{\nxie2024deligrasp,\ntitle={DeliGrasp: Inferring Object Properties with {LLM}s for Adaptive Grasp Policies},\nauthor={William Xie and Maria Valentini and Jensen Lavering and Nikolaus Correll},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rY5T2aIjPZ}\n}", "github": "https://github.com/deligrasp/deligrasp", "project": "", "reviewers": "M14h;JraE;uYAW", "site": "https://openreview.net/forum?id=rY5T2aIjPZ", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;5;3", "rating_avg": 3.3333333333333335, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.9999999999999998, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17526579402706439196&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 2, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Colorado", "aff_unique_dep": "", "aff_unique_url": 
"https://www.colorado.edu", "aff_unique_abbr": "CU", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Boulder", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "rvKWXxIvj0", "title": "Non-rigid Relative Placement through 3D Dense Diffusion", "track": "main", "status": "Poster", "tldr": "", "abstract": "The task of \"relative placement\" is to predict the placement of one object in relation to another, e.g. placing a mug on a mug rack. Recent methods for relative placement have made tremendous progress towards data-efficient learning for robot manipulation; using explicit object-centric geometric reasoning, these approaches enable generalization to unseen task variations from a small number of demonstrations. State-of-the-art works in this area, however, have yet to represent deformable transformations, despite the ubiquity of non-rigid bodies in real world settings. As a first step towards bridging this gap, we propose \"cross-displacement\" - an extension of the principles of relative placement to geometric relationships between deformable objects - and present a novel vision-based method to learn cross-displacement for a non-rigid task through dense diffusion. To this end, we demonstrate our method's ability to generalize to unseen object instances, out-of-distribution scene configurations, and multimodal goals on a highly deformable cloth-hanging task beyond the scope of prior works.", "keywords": "Deformable;Non-rigid;Manipulation;Relative Placement", "primary_area": "", "supplementary_material": "/attachment/7581c4eca3e8c6623af83bc7eeb71a5bea434aae.zip", "author": "Eric Cai;Octavian Donca;Ben Eisner;David Held", "authorids": "~Eric_Cai1;odonca@andrew.cmu.edu;~Ben_Eisner1;~David_Held1", "gender": ";;M;M", "homepage": ";;;http://davheld.github.io/", "dblp": ";;;22/11147", "google_scholar": ";;RWe-v0UAAAAJ;0QtU-NsAAAAJ", "orcid": ";;;", "linkedin": "eric-cai/;;;", "or_profile": "~Eric_Cai1;odonca@andrew.cmu.edu;~Ben_Eisner1;~David_Held1", "aff": "Carnegie Mellon University;;Carnegie Mellon University;Carnegie Mellon University", "aff_domain": "cmu.edu;;cmu.edu;cmu.edu", "position": "MS student;;PhD student;Associate Professor", "bibtex": "@inproceedings{\ncai2024nonrigid,\ntitle={Non-rigid Relative Placement through 3D Dense Diffusion},\nauthor={Eric Cai and Octavian Donca and Ben Eisner and David Held},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=rvKWXxIvj0}\n}", "github": "", "project": "", "reviewers": "6Ki8;QqMb;3d53", "site": "https://openreview.net/forum?id=rvKWXxIvj0", "pdf_size": 0, "rating": "2;2;3", "confidence": "5;4;3", "rating_avg": 2.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:GxRfQ0RK3LgJ:scholar.google.com/&scioq=Non-rigid+Relative+Placement+through+3D+Dense+Diffusion&hl=en&as_sdt=0,5", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "Carnegie Mellon University", "aff_unique_dep": "", "aff_unique_url": "https://www.cmu.edu", "aff_unique_abbr": "CMU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "s0VNSnPeoA", "title": "Text2Interaction: Establishing Safe and Preferable Human-Robot Interaction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Adjusting robot behavior to 
human preferences can require intensive human feedback, preventing quick adaptation to new users and changing circumstances. Moreover, current approaches typically treat user preferences as a reward, which requires a manual balance between task success and user satisfaction. To integrate new user preferences in a zero-shot manner, our proposed Text2Interaction framework invokes large language models to generate a task plan, motion preferences as Python code, and parameters of a safety controller. By maximizing the combined probability of task completion and user satisfaction instead of a weighted sum of rewards, we can reliably find plans that fulfill both requirements. We find that 83% of users working with Text2Interaction agree that it integrates their preferences into the plan of the robot, and 94% prefer Text2Interaction over the baseline. Our ablation study shows that Text2Interaction aligns better with unseen preferences than other baselines while maintaining a high success rate. Real-world demonstrations and code are made available at [sites.google.com/view/text2interaction](sites.google.com/view/text2interaction).", "keywords": "Human-Robot Interaction;Human Preference Learning;Task and Motion Planning;Safe Control", "primary_area": "", "supplementary_material": "/attachment/f990f8fd68e717207c76d29b9901bf49ddb94ee0.zip", "author": "Jakob Thumm;Christopher Agia;Marco Pavone;Matthias Althoff", "authorids": "~Jakob_Thumm1;~Christopher_Agia1;~Marco_Pavone1;~Matthias_Althoff1", "gender": "M;M;M;M", "homepage": "https://jakob-thumm.com;https://www.chrisagia.com/;https://web.stanford.edu/~pavone/;https://www.ce.cit.tum.de/cps/members/prof-dr-ing-matthias-althoff/", "dblp": ";268/3555;91/3382-1.html;67/1387", "google_scholar": "https://scholar.google.de/citations?user=sBm3vkcAAAAJ;t8Em5FwAAAAJ;RhOpyXcAAAAJ;https://scholar.google.com.tw/citations?user=E3zazJAAAAAJ", "orcid": "0000-0003-0282-2908;0000-0002-1208-2539;;0000-0003-3733-842X", "linkedin": ";agiachris/;;", "or_profile": "~Jakob_Thumm1;~Christopher_Agia1;~Marco_Pavone1;~Matthias_Althoff1", "aff": "Stanford University;Stanford University;Stanford University;Technische Universit\u00e4t M\u00fcnchen", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;tum.de", "position": "Intern;PhD student;Associate Professor;Associate Professor", "bibtex": "@inproceedings{\nthumm2024textinteraction,\ntitle={Text2Interaction: Establishing Safe and Preferable Human-Robot Interaction},\nauthor={Jakob Thumm and Christopher Agia and Marco Pavone and Matthias Althoff},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s0VNSnPeoA}\n}", "github": "https://github.com/JakobThumm/text2interaction", "project": "", "reviewers": "VQtk;BygU;dJ5U", "site": "https://openreview.net/forum?id=s0VNSnPeoA", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;5;3", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5818457563405214440&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0;1", "aff_unique_norm": "Stanford University;Technische Universit\u00e4t M\u00fcnchen", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tum.de", "aff_unique_abbr": "Stanford;TUM", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "United 
States;Germany" }, { "id": "s0vHSq5QEv", "title": "Generalizing End-To-End Autonomous Driving In Real-World Environments Using Zero-Shot LLMs", "track": "main", "status": "Poster", "tldr": "", "abstract": "Traditional autonomous driving methods adopt modular design, decomposing tasks into sub-tasks, including perception, prediction, planning, and control. In contrast, end-to-end autonomous driving directly outputs actions from raw sensor data, avoiding error accumulation. However, training an end-to-end model requires a comprehensive dataset. Without adequate data, the end-to-end model exhibits poor generalization capabilities. Recently, large language models (LLMs) have been applied to enhance the generalization property of end-to-end driving models. Most studies explore LLMs in an open-loop manner, where the output actions are compared to those of experts without direct activation in the real world. Other studies in closed-loop settings examine their results in simulated environments. In comparison, this paper proposes an efficient architecture that integrates multimodal LLMs into end-to-end real-world driving models in a closed-loop setting. The LLM periodically takes raw sensor data to generate high-level driving instructions. In our architecture, LLMs can effectively guide the end-to-end model, even at a slower rate than the raw sensor data, because updates aren't needed every time frame. This architecture relaxes the trade-off between the latency and inference quality of the LLM. It also allows us to choose a wide variety of LLMs to improve high-level driving instructions and minimize fine-tuning costs. Consequently, our architecture reduces the data collection requirements because the LLMs do not directly output actions, and we only need to train a simple imitation learning model to output actions. In our experiments, the training data for the end-to-end model in a real-world environment consists of only simple obstacle configurations with one traffic cone, while the test environment is more complex and contains different types of obstacles. 
Experiments show that the proposed architecture enhances the generalization capabilities of the end-to-end model even without fine-tuning the LLM.", "keywords": "End-to-end Autonomous Driving;Large Vision-Language Model;Generalization", "primary_area": "", "supplementary_material": "/attachment/14b81f9ac412465053abe9250c7dcf63d193456f.zip", "author": "Zeyu Dong;Yimin Zhu;Yansong Li;Kevin Mahon;Yu Sun", "authorids": "~Zeyu_Dong3;~Yimin_Zhu2;~Yansong_Li2;~Kevin_Mahon1;~Yu_Sun22", "gender": "Not Specified;;M;M;F", "homepage": ";https://vwslz.github.io/;https://jackyansongli.github.io/docs/main.html;https://github.com/kevin-mahon;", "dblp": ";;;;", "google_scholar": "https://scholar.google.com/citations?hl=en;;https://scholar.google.com/citations?hl=en;;", "orcid": ";;0009-0008-1800-9608;;", "linkedin": "zeyu-dong-78a445211/;;;kevin-mahon-baa811199/;yu-sun-777a1188", "or_profile": "~Zeyu_Dong3;~Yimin_Zhu2;~Yansong_Li2;~Kevin_Mahon1;~Yu_Sun22", "aff": "State University of New York at Stony Brook;, State University of New York at Stony Brook;University of Illinois Chicago;Sunrise AI Tech;Sunrise Technology Inc.", "aff_domain": "stonybrook.edu;cs.stonybrook.edu;uic.edu;sunriseaitech.com;sunriseaitech.com", "position": "PhD student;PhD student;PhD student;Software Engineer;Researcher", "bibtex": "@inproceedings{\ndong2024generalizing,\ntitle={Generalizing End-To-End Autonomous Driving In Real-World Environments Using Zero-Shot {LLM}s},\nauthor={Zeyu Dong and Yimin Zhu and Yansong Li and Kevin Mahon and Yu Sun},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s0vHSq5QEv}\n}", "github": "", "project": "", "reviewers": "1ZPV;XRbz;Fw6g", "site": "https://openreview.net/forum?id=s0vHSq5QEv", "pdf_size": 0, "rating": "2;2;4", "confidence": "4;4;5", "rating_avg": 2.6666666666666665, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 1.0, "gs_citation": 5, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17582264019074679206&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;2;3", "aff_unique_norm": "State University of New York at Stony Brook;University of Illinois at Chicago;Sunrise AI Tech;Sunrise Technology Inc.", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.stonybrook.edu;https://www.uic.edu;;", "aff_unique_abbr": "SUNY Stony Brook;UIC;;", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Stony Brook;Chicago;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States;" }, { "id": "s31IWg2kN5", "title": "Exploring Under Constraints with Model-Based Actor-Critic and Safety Filters", "track": "main", "status": "Poster", "tldr": "", "abstract": "Applying reinforcement learning (RL) to learn effective policies on physical robots without supervision remains challenging when it comes to tasks where safe exploration is critical. Constrained model-based RL (CMBRL) presents a promising approach to this problem. These methods are designed to learn constraint-adhering policies through constrained optimization approaches. Yet, such policies often fail to meet stringent safety requirements during learning and exploration. Our solution ``CASE'' aims to reduce the instances where constraints are breached during the learning phase. 
Specifically, CASE integrates techniques for optimizing constrained policies and employs planning-based safety filters as backup policies, effectively lowering constraint violations during learning and making it a more reliable option than other recent constrained model-based policy optimization methods.", "keywords": "Model-based RL;Safe RL;Safety Filter;Exploration", "primary_area": "", "supplementary_material": "/attachment/3a76327920b0715267c764c54afd4ba00b79dc87.zip", "author": "Ahmed Agha;Baris Kayalibay;Atanas Mirchev;Patrick van der Smagt;Justin Bayer", "authorids": "~Ahmed_Agha1;~Baris_Kayalibay1;~Atanas_Mirchev1;~Patrick_van_der_Smagt1;~Justin_Bayer1", "gender": "M;;M;M;M", "homepage": ";;;https://argmax.org;", "dblp": ";194/2562;171/4448.html;24/6573.html;", "google_scholar": "It8RcRIAAAAJ;;;https://scholar.google.de/citations?user=5ybzvbsAAAAJ;https://scholar.google.de/citations?user=kczEEFAAAAAJ", "orcid": ";;0000-0003-2890-5015;0000-0003-4418-4916;", "linkedin": "ahmed-agha-5862b0195/?originalSubdomain=de;;;smagt/;", "or_profile": "~Ahmed_Agha1;~Baris_Kayalibay1;~Atanas_Mirchev1;~Patrick_van_der_Smagt1;~Justin_Bayer1", "aff": "Volkswagen Group;Data Lab, Volkswagen Group;Machine Learning Research Lab, Volkswagen Group;Machine Learning Research Lab; Volkswagen Group;VW Group", "aff_domain": "volkswagen.de;volkswagen.de;argmax.ai;volkswagen.de;volkswagen.de", "position": "Research Assistant;PhD student;PhD student;Full Professor;research scientist", "bibtex": "@inproceedings{\nagha2024exploring,\ntitle={Exploring Under Constraints with Model-Based Actor-Critic and Safety Filters},\nauthor={Ahmed Agha and Baris Kayalibay and Atanas Mirchev and Patrick van der Smagt and Justin Bayer},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=s31IWg2kN5}\n}", "github": "", "project": "", "reviewers": "1NEA;QmYe;Frme", "site": "https://openreview.net/forum?id=s31IWg2kN5", "pdf_size": 0, "rating": "2;3;3", "confidence": "4;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16588924339973043403&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;0;1;0;0", "aff_unique_norm": "Volkswagen Group;Machine Learning Research Lab", "aff_unique_dep": ";Machine Learning Research", "aff_unique_url": "https://www.volkswagenag.com;", "aff_unique_abbr": "VW Group;", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "Germany;" }, { "id": "t0LkF9JnVb", "title": "PianoMime: Learning a Generalist, Dexterous Piano Player from Internet Demonstrations", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this work, we introduce PianoMime, a framework for training a piano-playing agent using internet demonstrations.\nThe internet is a promising source of large-scale demonstrations for training our robot agents. 
\nIn particular, for the case of piano-playing, Youtube is full of videos of professional pianists playing a wide myriad of songs.\nIn our work, we leverage these demonstrations to learn a generalist piano-playing agent capable of playing any arbitrary song.\nOur framework is divided into three parts: a data preparation phase to extract the informative features from the Youtube videos,\na policy learning phase to train song-specific expert policies from the demonstrations and a policy distillation phase to distil the policies into a single generalist agent.\nWe explore different policy designs to represent the agent and evaluate the influence of the amount of training data on the generalization capability of the agent to novel songs not available in the dataset.\nWe show that we are able to learn a policy with up to 57% F1 score on unseen songs.", "keywords": "Reinforcement Learning;Imitation Learning;Robotics;Dexterous Manipulation", "primary_area": "", "supplementary_material": "/attachment/44e35fccb0c7c2c86043b2b40f7361f0c0abb012.zip", "author": "Cheng Qian;Julen Urain;Kevin Zakka;Jan Peters", "authorids": "~Cheng_Qian8;~Julen_Urain2;~Kevin_Zakka1;~Jan_Peters3", "gender": "M;M;M;M", "homepage": ";https://kzakka.com/;https://www.jan-peters.net;https://thecamusean.github.io/", "dblp": ";;p/JanPeters1;228/9720", "google_scholar": "https://scholar.google.com/citations?hl=en;8qHnRnsAAAAJ;https://scholar.google.de/citations?user=-kIVAcAAAAAJ;lx5qencAAAAJ", "orcid": ";;0000-0002-5266-8091;", "linkedin": ";;janrpeters/;", "or_profile": "~Cheng_Qian8;~Kevin_Zakka1;~Jan_Peters3;~julen_urain1", "aff": "Technische Universit\u00e4t M\u00fcnchen;University of California, Berkeley;TU Darmstadt;Technische Universit\u00e4t Darmstadt", "aff_domain": "tum.de;berkeley.edu;tu-darmstadt.de;tu-darmstadt.de", "position": "MS student;PhD student;Full Professor;Postdoc", "bibtex": "@inproceedings{\nqian2024pianomime,\ntitle={PianoMime: Learning a Generalist, Dexterous Piano Player from Internet Demonstrations},\nauthor={Cheng Qian and Julen Urain and Kevin Zakka and Jan Peters},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=t0LkF9JnVb}\n}", "github": "https://github.com/sNiper-Qian/pianomime", "project": "", "reviewers": "gJnu;CxL8;Hkur", "site": "https://openreview.net/forum?id=t0LkF9JnVb", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;3", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7084112161122562752&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;University of California, Berkeley;Technische Universit\u00e4t Darmstadt", "aff_unique_dep": ";;", "aff_unique_url": "https://www.tum.de;https://www.berkeley.edu;https://www.tu-darmstadt.de", "aff_unique_abbr": "TUM;UC Berkeley;TU Darmstadt", "aff_campus_unique_index": "1;2", "aff_campus_unique": ";Berkeley;Darmstadt", "aff_country_unique_index": "0;1;0;0", "aff_country_unique": "Germany;United States" }, { "id": "tqsQGrmVEu", "title": "View-Invariant Policy Learning via Zero-Shot Novel View Synthesis", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large-scale visuomotor policy learning is a promising approach toward developing generalizable manipulation systems. 
Yet, policies that can be deployed on diverse embodiments, environments, and observational modalities remain elusive. \nIn this work, we investigate how knowledge from large-scale visual data of the world may be used to address one axis of variation for generalizable manipulation: observational viewpoint. Specifically, we study single-image novel view synthesis models, which learn 3D-aware scene-level priors by rendering images of the same scene from alternate camera viewpoints given a single input image.\nFor practical application to diverse robotic data, these models must operate *zero-shot*, performing view synthesis on unseen tasks and environments. We empirically analyze view synthesis models within a simple data-augmentation scheme that we call View Synthesis Augmentation (VISTA) to understand their capabilities for learning viewpoint-invariant policies from single-viewpoint demonstration data. Upon evaluating the robustness of policies trained with our method to out-of-distribution camera viewpoints, we find that they outperform baselines in both simulated and real-world manipulation tasks.", "keywords": "generalization;visual imitation learning;view synthesis", "primary_area": "", "supplementary_material": "/attachment/20fc8ff578ff6ddd248d5ec31a7b451ab93f060d.zip", "author": "Stephen Tian;Blake Wulfe;Kyle Sargent;Katherine Liu;Sergey Zakharov;Vitor Campagnolo Guizilini;Jiajun Wu", "authorids": "~Stephen_Tian1;~Blake_Wulfe1;~Kyle_Sargent1;~Katherine_Liu1;~Sergey_Zakharov1;~Vitor_Campagnolo_Guizilini2;~Jiajun_Wu1", "gender": "M;;;F;M;M;M", "homepage": "http://s-tian.github.io;;https://kylesargent.github.io;https://thekatherineliu.com;https://zakharos.github.io/;;https://jiajunwu.com", "dblp": "237/9780;;298/0019;226/6398;195/5832;;117/4768", "google_scholar": "l19pn2sAAAAJ;;Lom6iMAAAAAJ;PhpQD2YAAAAJ;https://scholar.google.de/citations?user=3DK3I-8AAAAJ;UH9tP6QAAAAJ;2efgcS0AAAAJ", "orcid": ";;0009-0009-8609-6894;;;;0000-0002-4176-343X", "linkedin": ";;kyle-sargent-784006134/;;;vitorguizilini/;jiajunwu/", "or_profile": "~Stephen_Tian1;~Blake_Wulfe1;~Kyle_Sargent1;~Katherine_Liu1;~Sergey_Zakharov1;~Vitor_Campagnolo_Guizilini2;~Jiajun_Wu1", "aff": "Stanford University;;Computer Science Department, Stanford University;Toyota Research Institute;Toyota Research Institute;Toyota Research Institute;Stanford University", "aff_domain": "stanford.edu;;cs.stanford.edu;tri.global;tri.global;tri.global;stanford.edu", "position": "PhD student;;PhD student;Researcher;Researcher;Staff Research Scientist;Assistant Professor", "bibtex": "@inproceedings{\ntian2024viewinvariant,\ntitle={View-Invariant Policy Learning via Zero-Shot Novel View Synthesis},\nauthor={Stephen Tian and Blake Wulfe and Kyle Sargent and Katherine Liu and Sergey Zakharov and Vitor Campagnolo Guizilini and Jiajun Wu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=tqsQGrmVEu}\n}", "github": "https://github.com/s-tian/VISTA", "project": "", "reviewers": "EcF3;9hYx;Du3J", "site": "https://openreview.net/forum?id=tqsQGrmVEu", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=11312348118354849614&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Stanford University;Toyota Research Institute", "aff_unique_dep": 
";", "aff_unique_url": "https://www.stanford.edu;https://www.tri.global", "aff_unique_abbr": "Stanford;TRI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "ty1cqzTtUv", "title": "RT-Sketch: Goal-Conditioned Imitation Learning from Hand-Drawn Sketches", "track": "main", "status": "Poster", "tldr": "", "abstract": "Natural language and images are commonly used as goal representations in goal-conditioned imitation learning. However, language can be ambiguous and images can be over-specified. In this work, we study hand-drawn sketches as a modality for goal specification. Sketches can be easy to provide on the fly like language, but like images they can also help a downstream policy to be spatially-aware. By virtue of being minimal, sketches can further help disambiguate task-relevant from irrelevant objects. We present RT-Sketch, a goal-conditioned policy for manipulation that takes a hand-drawn sketch of the desired scene as input, and outputs actions. We train RT-Sketch on a dataset of trajectories paired with synthetically generated goal sketches. We evaluate this approach on six manipulation skills involving tabletop object rearrangements on an articulated countertop. Experimentally we find that RT-Sketch performs comparably to image or language-conditioned agents in straightforward settings, while achieving greater robustness when language goals are ambiguous or visual distractors are present. Additionally, we show that RT-Sketch handles sketches with varied levels of specificity, ranging from minimal line drawings to detailed, colored drawings. For supplementary material and videos, please visit http://rt-sketch.github.io.", "keywords": "Visual Imitation Learning;Goal-Conditioned Manipulation", "primary_area": "", "supplementary_material": "/attachment/9a8a9b17c9de38e1bcf1b4c3c880964abb2c09b1.zip", "author": "Priya Sundaresan;Quan Vuong;Jiayuan Gu;Peng Xu;Ted Xiao;Sean Kirmani;Tianhe Yu;Michael Stark;Ajinkya Jain;Karol Hausman;Dorsa Sadigh;Jeannette Bohg;Stefan Schaal", "authorids": "~Priya_Sundaresan1;~Quan_Vuong2;~Jiayuan_Gu1;~Peng_Xu9;~Ted_Xiao1;~Sean_Kirmani1;~Tianhe_Yu1;~Michael_Stark5;~Ajinkya_Jain1;~Karol_Hausman2;~Dorsa_Sadigh1;~Jeannette_Bohg1;~Stefan_Schaal1", "gender": "F;M;;M;M;M;M;;M;;F;;M", "homepage": ";https://quanvuong.github.io;https://jiayuan-gu.github.io/;;https://www.tedxiao.me;https://kirmani.io/;https://cs.stanford.edu/~tianheyu/;;https://jainajinkya.github.io;;https://dorsa.fyi/;https://web.stanford.edu/~bohg/;http://www-clmc.net", "dblp": ";;210/2429;;198/0598;;192/1797;;166/0292;;117/3174;52/7377;32/3952", "google_scholar": "7SUquR4AAAAJ;NSWI3OwAAAAJ;YH1v2uYAAAAJ;460NWeQAAAAJ;;iyEuK8kAAAAJ;;;CNUnD7kAAAAJ;;ZaJEZpYAAAAJ;rjnJnEkAAAAJ;", "orcid": ";;0000-0002-3207-7921;;;;;;;;;0000-0002-4921-7193;", "linkedin": ";;;;;skirmani;;;;;;;", "or_profile": "~Priya_Sundaresan1;~Quan_Vuong2;~Jiayuan_Gu1;~Peng_Xu9;~Ted_Xiao1;~Sean_Kirmani1;~Tianhe_Yu1;~Michael_Stark5;~Ajinkya_Jain1;~Karol_Hausman2;~Dorsa_Sadigh1;~Jeannette_Bohg1;~Stefan_Schaal1", "aff": "Stanford University;physical intelligence;University of California, San Diego;Google;;Google DeepMind;Google Brain;;Intrinsic Innovation LLC;;Stanford University;Stanford University;", "aff_domain": "stanford.edu;physicalintelligence.company;ucsd.edu;google.com;;google.com;google.com;;intrinsic.ai;;stanford.edu;stanford.edu;", "position": "PhD student;Researcher;PhD student;Researcher;;Researcher;Research 
Scientist;;Researcher;;Assistant Professor;Assistant Professor;", "bibtex": "@inproceedings{\nsundaresan2024rtsketch,\ntitle={{RT}-Sketch: Goal-Conditioned Imitation Learning from Hand-Drawn Sketches},\nauthor={Priya Sundaresan and Quan Vuong and Jiayuan Gu and Peng Xu and Ted Xiao and Sean Kirmani and Tianhe Yu and Michael Stark and Ajinkya Jain and Karol Hausman and Dorsa Sadigh and Jeannette Bohg and Stefan Schaal},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ty1cqzTtUv}\n}", "github": "", "project": "", "reviewers": "LJYq;C69L;SdeT", "site": "https://openreview.net/forum?id=ty1cqzTtUv", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 13, "corr_rating_confidence": 0.0, "gs_citation": 11, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17943446436668853152&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;3;3;4;0;0", "aff_unique_norm": "Stanford University;Physical Intelligence;University of California, San Diego;Google;Intrinsic Innovation LLC", "aff_unique_dep": ";;;Google;", "aff_unique_url": "https://www.stanford.edu;;https://www.ucsd.edu;https://www.google.com;", "aff_unique_abbr": "Stanford;;UCSD;Google;", "aff_campus_unique_index": "0;2;3;3;0;0", "aff_campus_unique": "Stanford;;San Diego;Mountain View", "aff_country_unique_index": "0;0;0;2;0;0;0;0", "aff_country_unique": "United States;;United Kingdom" }, { "id": "uEbJXWobif", "title": "EXTRACT: Efficient Policy Learning by Extracting Transferable Robot Skills from Offline Data", "track": "main", "status": "Poster", "tldr": "", "abstract": "Most reinforcement learning (RL) methods focus on learning optimal policies over low-level action spaces. While these methods can perform well in their training environments, they lack the flexibility to transfer to new tasks. \nInstead, RL agents that can act over useful, temporally extended skills rather than low-level actions can learn new tasks more easily.\nPrior work in skill-based RL either requires expert supervision to define useful skills, which is hard to scale, or learns a skill-space from offline data with heuristics that limit the adaptability of the skills, making them difficult to transfer during downstream RL.\nOur approach, EXTRACT, instead utilizes pre-trained vision language models to extract a discrete set of semantically meaningful skills from offline data, each of which is parameterized by continuous arguments, without human supervision. 
\nThis skill parameterization allows robots to learn new tasks by only needing to learn when to select a specific skill and how to modify its arguments for the specific task.\nWe demonstrate through experiments in sparse-reward, image-based, robot manipulation environments that EXTRACT can more quickly learn new tasks than prior works, with major gains in sample efficiency and performance over prior skill-based RL.", "keywords": "reinforcement learning;skill-based reinforcement learning;skill learning;transfer learning;foundation models for robotics;robot learning", "primary_area": "", "supplementary_material": "/attachment/5cbb40be809950cf4c5521430c449fc41b52ccc2.zip", "author": "Jesse Zhang;Minho Heo;Zuxin Liu;Erdem Biyik;Joseph J Lim;Yao Liu;Rasool Fakoor", "authorids": "~Jesse_Zhang3;~Minho_Heo1;~Zuxin_Liu1;~Erdem_Biyik1;~Joseph_J_Lim1;~Yao_Liu1;~Rasool_Fakoor1", "gender": "M;M;M;M;M;M;M", "homepage": "https://jessezhang.net;https://minoring.github.io/;https://www.zuxin.me;http://people.eecs.berkeley.edu/~ebiyik/;http://people.csail.mit.edu/lim/;http://yao-liu.com/;http://rasoolfa.github.io", "dblp": ";;227/3137;194/2736;08/3086;64/424-9.html;123/2447", "google_scholar": "fSXCOfEAAAAJ;;5ApCTCoAAAAJ;https://scholar.google.com.tr/citations?user=P-G3sjYAAAAJ;jTnQTBoAAAAJ;umAny5UAAAAJ;nVsOPtQAAAAJ", "orcid": ";;0000-0001-7412-5074;0000-0002-9516-3130;;;", "linkedin": ";;zuxin-liu/;https://linkedin.com/in/ebiyik;;;rasool-fakoor-695b5845/", "or_profile": "~Jesse_Zhang3;~Minho_Heo1;~Zuxin_Liu1;~Erdem_Biyik1;~Joseph_J_Lim1;~Yao_Liu1;~Rasool_Fakoor1", "aff": "NVIDIA;Korea Advanced Institute of Science & Technology;Salesforce AI Research;University of Southern California;Korea Advanced Institute of Science & Technology;Amazon;Amazon Web Services", "aff_domain": "nvidia.com;kaist.ac.kr;salesforce.com;usc.edu;kaist.ac.kr;amazon.com;amazon.com", "position": "Intern;PhD student;Researcher;Assistant Professor;Associate Professor;Researcher;Researcher", "bibtex": "@inproceedings{\nzhang2024extract,\ntitle={{EXTRACT}: Efficient Policy Learning by Extracting Transferable Robot Skills from Offline Data},\nauthor={Jesse Zhang and Minho Heo and Zuxin Liu and Erdem Biyik and Joseph J Lim and Yao Liu and Rasool Fakoor},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uEbJXWobif}\n}", "github": "", "project": "", "reviewers": "Jq78;2VAd;moSg", "site": "https://openreview.net/forum?id=uEbJXWobif", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;3;4", "rating_avg": 3.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=900626335062337405&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;1;4;4", "aff_unique_norm": "NVIDIA;Korea Advanced Institute of Science and Technology;Salesforce;University of Southern California;Amazon", "aff_unique_dep": "NVIDIA Corporation;;Salesforce AI Research;;Amazon.com, Inc.", "aff_unique_url": "https://www.nvidia.com;https://www.kaist.ac.kr;https://www.salesforce.com;https://www.usc.edu;https://www.amazon.com", "aff_unique_abbr": "NVIDIA;KAIST;Salesforce AI;USC;Amazon", "aff_campus_unique_index": "1", "aff_campus_unique": ";Los Angeles", "aff_country_unique_index": "0;1;0;0;1;0;0", "aff_country_unique": "United States;South Korea" }, { "id": "uHdVI3QMr6", "title": "A Dual Approach to Imitation Learning from Observations with
Offline Datasets", "track": "main", "status": "Poster", "tldr": "", "abstract": "Demonstrations are an effective alternative to task specification for learning agents in settings where designing a reward function is difficult. However, demonstrating expert behavior in the action space of the agent becomes unwieldy when robots have complex, unintuitive morphologies. We consider the practical setting where an agent has a dataset of prior interactions with the environment and is provided with observation-only expert demonstrations. Typical learning from observations approaches have required either learning an inverse dynamics model or a discriminator as intermediate steps of training. Errors in these intermediate one-step models compound during downstream policy learning or deployment. We overcome these limitations by directly learning a multi-step utility function that quantifies how each action impacts the agent's divergence from the expert's visitation distribution. Using the principle of duality, we derive DILO (Dual Imitation Learning from Observations), an algorithm that can leverage arbitrary suboptimal data to learn imitating policies without requiring expert actions. DILO reduces the learning from observations problem to that of simply learning an actor and a critic, bearing similar complexity to vanilla offline RL. This allows DILO to gracefully scale to high dimensional observations, and demonstrate improved performance across the board.", "keywords": "Learning from Observations;Imitation Learning", "primary_area": "", "supplementary_material": "/attachment/19300e04358cc22fa507ba7409740c27b9d19488.zip", "author": "Harshit Sikchi;Caleb Chuck;Amy Zhang;Scott Niekum", "authorids": "~Harshit_Sikchi1;~Caleb_Chuck1;~Amy_Zhang1;~Scott_Niekum1", "gender": "M;M;M;F", "homepage": "https://hari-sikchi.github.io/;http://calcharles.github.io;https://people.cs.umass.edu/~sniekum/index.php;", "dblp": "271/4663;;62/8399;43/2754", "google_scholar": "jFOPZE0AAAAJ;gELTaB4AAAAJ;4wXYfSUAAAAJ;", "orcid": ";;;", "linkedin": ";;;", "or_profile": "~Harshit_Sikchi1;~Caleb_Chuck1;~Scott_Niekum1;~Amy_Zhang2", "aff": "University of Texas, Austin;University of Texas, Austin;University of Massachusetts at Amherst;Meta Facebook", "aff_domain": "utexas.edu;utexas.edu;umass.edu;facebook.com", "position": "PhD student;PhD student;Associate Professor;Research Scientist", "bibtex": "@inproceedings{\nsikchi2024a,\ntitle={A Dual Approach to Imitation Learning from Observations with Offline Datasets},\nauthor={Harshit Sikchi and Caleb Chuck and Amy Zhang and Scott Niekum},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uHdVI3QMr6}\n}", "github": "https://github.com/hari-sikchi/DILO", "project": "", "reviewers": "5eEJ;U3CE;2kPL", "site": "https://openreview.net/forum?id=uHdVI3QMr6", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5634464023084393921&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;2", "aff_unique_norm": "University of Texas at Austin;University of Massachusetts Amherst;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.utexas.edu;https://www.umass.edu;https://meta.com", "aff_unique_abbr": "UT Austin;UMass Amherst;Meta", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": 
"Austin;Amherst;", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "uJBMZ6S02T", "title": "Real-to-Sim Grasp: Rethinking the Gap between Simulation and Real World in Grasp Detection", "track": "main", "status": "Poster", "tldr": "", "abstract": "For 6-DoF grasp detection, simulated data is expandable to train more powerful model, but it faces the challenge of the large gap between simulation and real world. Previous works bridge this gap with a sim-to-real way. However, this way explicitly or implicitly forces the simulated data to adapt to the noisy real data when training grasp detectors, where the positional drift and structural distortion within the camera noise will harm the grasp learning. In this work, we propose a Real-to-Sim framework for 6-DoF Grasp detection, named R2SGrasp, with the key insight of bridging this gap in a real-to-sim way, which directly bypasses the camera noise in grasp detector training through an inference-time real-to-sim adaption. To achieve this real-to-sim adaptation, our R2SGrasp designs the Real-to-Sim Data Repairer (R2SRepairer) to mitigate the camera noise of real depth maps in data-level, and the Real-to-Sim Feature Enhancer (R2SEnhancer) to enhance real features with precise simulated geometric primitives in feature-level. To endow our framework with the generalization ability, we construct a large-scale simulated dataset cost-efficiently to train our grasp detector, which includes 64,000 RGB-D images with 14.4 million grasp annotations. Sufficient experiments show that R2SGrasp is powerful and our real-to-sim perspective is effective. The real-world experiments further show great generalization ability of R2SGrasp. Project page is available on https://isee-laboratory.github.io/R2SGrasp.", "keywords": "Grasp pose detection;simulated datasets;sim-to-real", "primary_area": "", "supplementary_material": "/attachment/ed7f1698ed24313ab2d3cc0ba6bfd21fbc299385.zip", "author": "Jia-Feng Cai;Zibo Chen;Xiao-Ming Wu;Jian-Jian Jiang;Yi-Lin Wei;Wei-Shi Zheng", "authorids": "~Jia-Feng_Cai1;~Zibo_Chen1;~Xiao-Ming_Wu5;~Jian-Jian_Jiang1;~Yi-Lin_Wei1;~Wei-Shi_Zheng3", "gender": "M;;M;M;M;M", "homepage": "https://github.com/ZiLianC;;https://dravenalg.github.io/;https://github.com/JianJian-Jiang;http://www.isee-ai.cn/~zhwshi;", "dblp": ";;98/2898-2;;30/8399;376/2518.html", "google_scholar": ";;https://scholar.google.com/citations?hl=zh-CN;;AwqDDGoAAAAJ;", "orcid": ";0000-0003-0572-7076;0000-0003-1115-8551;;;", "linkedin": ";;;;;", "or_profile": "~Jia-Feng_Cai1;~Zibo_Chen1;~Xiao-Ming_Wu5;~Jian-Jian_Jiang1;~Wei-Shi_Zheng3;~yilin_wei1", "aff": "SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;Macquarie University;SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY;SUN YAT-SEN UNIVERSITY", "aff_domain": "sysu.edu.cn;sysu.edu.cn;mq.edu.au;sysu.edu.cn;sysu.edu.cn;sysu.edu.cn", "position": "MS student;MS student;Intern;MS student;Full Professor;PhD student", "bibtex": "@inproceedings{\ncai2024realtosim,\ntitle={Real-to-Sim Grasp: Rethinking the Gap between Simulation and Real World in Grasp Detection},\nauthor={Jia-Feng Cai and Zibo Chen and Xiao-Ming Wu and Jian-Jian Jiang and Yi-Lin Wei and Wei-Shi Zheng},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uJBMZ6S02T}\n}", "github": "", "project": "", "reviewers": "7Q2J;xCby;SR4p;TS41", "site": "https://openreview.net/forum?id=uJBMZ6S02T", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "3;3;5;3", "rating_avg": 3.0, "confidence_avg": 3.5, 
"replies_avg": 6, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=3235381705674831491&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;1;0;0;0", "aff_unique_norm": "Sun Yat-sen University;Macquarie University", "aff_unique_dep": ";", "aff_unique_url": "http://www.sysu.edu.cn;https://www.mq.edu.au", "aff_unique_abbr": "SYSU;MQ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;Australia" }, { "id": "uMZ2jnZUDX", "title": "Learning H-Infinity Locomotion Control", "track": "main", "status": "Poster", "tldr": "", "abstract": "Stable locomotion in precipitous environments is an essential task for quadruped robots, requiring the ability to resist various external disturbances. Recent neural policies enhance robustness against disturbances by learning to resist external forces sampled from a fixed distribution in the simulated environment. However, the force generation process doesn\u2019t consider the robot\u2019s current state, making it difficult to identify the most effective direction and magnitude that can push the robot to the most unstable but recoverable state. Thus, challenging cases in the buffer are insufficient to optimize robustness. In this paper, we propose to model the robust locomotion learning process as an adversarial interaction between the locomotion policy and a learnable disturbance that is conditioned on the robot state to generate appropriate external forces. To make the joint optimization stable, our novel $H_{\\infty}$ constraint mandates the bound of the ratio between the cost and the intensity of the external forces. We verify the robustness of our approach in both simulated environments and real-world deployment, on quadrupedal locomotion tasks and a more challenging task where the quadruped performs locomotion merely on hind legs. 
Training and deployment code will be made public.", "keywords": "Robot Learning;Quadrupedal Robot;Robust Locomotion", "primary_area": "", "supplementary_material": "/attachment/fa0dda93cb0a787e8b186a014216092af3430f0e.zip", "author": "Junfeng Long;Wenye Yu;Quanyi Li;ZiRui Wang;Dahua Lin;Jiangmiao Pang", "authorids": "~Junfeng_Long1;~Wenye_Yu1;~Quanyi_Li1;~ZiRui_Wang8;~Dahua_Lin1;~Jiangmiao_Pang1", "gender": "M;M;M;M;M;M", "homepage": "https://junfeng-long.github.io/;https://virlus.github.io;https://quanyili.github.io;http://dahua.site;https://oceanpang.github.io/;https://github.com/Wongziseoi", "dblp": "343/2990;;270/7691;53/6088;231/7630;", "google_scholar": "olmfqBEAAAAJ;https://scholar.google.com/citations?hl=en;Ty49X3UAAAAJ;GMzzRRUAAAAJ;https://scholar.google.com/citations?authuser=0;https://scholar.google.com/citations?hl=zh-CN", "orcid": "0000-0001-7047-4963;;;;0000-0002-6711-9319;", "linkedin": ";;https://www.linkedin.com/mwlite/in/quanyi-li-2b7985183;;;", "or_profile": "~Junfeng_Long1;~Wenye_Yu1;~Quanyi_Li1;~Dahua_Lin1;~Jiangmiao_Pang1;~ZiRui_Wang4", "aff": "Shanghai AI Laboratory;Shanghai Jiaotong University;University of Edinburgh;The Chinese University of Hong Kong;Shanghai AI Laboratory ;Shanghai Artificial Intelligence Laboratory", "aff_domain": "pjlab.org.cn;sjtu.edu.cn;ed.ac.uk;cuhk.edu.hk;pjlab.org.cn;pjlab.org.cn", "position": "Researcher;Undergrad student;MS student;Associate Professor;Research Scientist;Intern", "bibtex": "@inproceedings{\nlong2024learning,\ntitle={Learning H-Infinity Locomotion Control},\nauthor={Junfeng Long and Wenye Yu and Quanyi Li and ZiRui Wang and Dahua Lin and Jiangmiao Pang},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=uMZ2jnZUDX}\n}", "github": "", "project": "", "reviewers": "YAqL;p1Hn;ATZL", "site": "https://openreview.net/forum?id=uMZ2jnZUDX", "pdf_size": 0, "rating": "2;3;4", "confidence": "4;5;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.8660254037844385, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9745543359720670367&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;3;0;4", "aff_unique_norm": "Shanghai AI Laboratory;Shanghai Jiao Tong University;University of Edinburgh;Chinese University of Hong Kong;Shanghai Artificial Intelligence Laboratory", "aff_unique_dep": ";;;;", "aff_unique_url": "https://www.shanghai-ai-lab.com;https://www.sjtu.edu.cn;https://www.ed.ac.uk;https://www.cuhk.edu.hk;http://www.shailab.org/", "aff_unique_abbr": "SAIL;SJTU;Edinburgh;CUHK;Shanghai AI Lab", "aff_campus_unique_index": "1", "aff_campus_unique": ";Hong Kong SAR", "aff_country_unique_index": "0;0;1;0;0;0", "aff_country_unique": "China;United Kingdom" }, { "id": "ubq7Co6Cbv", "title": "Gaussian Splatting to Real World Flight Navigation Transfer with Liquid Networks", "track": "main", "status": "Poster", "tldr": "", "abstract": "Simulators are powerful tools for autonomous robot learning as they offer scalable data generation, flexible design, and optimization of trajectories. \nHowever, transferring behavior learned from simulation data into the real world proves to be difficult, usually mitigated with compute-heavy domain randomization methods or further model fine-tuning. We present a method to improve generalization and robustness to distribution shifts in sim-to-real visual quadrotor navigation tasks. 
To this end, we first build a simulator by integrating Gaussian Splatting with quadrotor flight dynamics, and then, train robust navigation policies using Liquid neural networks. In this way, we obtain a full-stack imitation learning protocol that combines advances in 3D Gaussian splatting radiance field rendering, crafty programming of expert demonstration training data, and the task understanding capabilities of Liquid networks. Through a series of quantitative flight tests, we demonstrate the robust transfer of navigation skills learned in a single simulation scene directly to the real world. We further show the ability to maintain performance beyond the training environment under drastic distribution and physical environment changes. Our learned Liquid policies, trained on single target maneuvers curated from a photorealistic simulated indoor flight only, generalize to multi-step hikes onboard a real hardware platform outdoors.", "keywords": "End-to-end learning;Gaussian Splatting;Sim-to-real transfer", "primary_area": "", "supplementary_material": "/attachment/ca8f4d520c912e4c2f28d03751fc935009861969.zip", "author": "Alex Quach;Makram Chahine;Alexander Amini;Ramin Hasani;Daniela Rus", "authorids": "~Alex_Quach1;~Makram_Chahine1;~Alexander_Amini1;~Ramin_Hasani1;~Daniela_Rus1", "gender": "Not Specified;Not Specified;;F;M", "homepage": ";https://www.mit.edu/~chahine/;https://www.mit.edu/~amini;https://www.csail.mit.edu/person/daniela-rus;http://www.raminhasani.com", "dblp": ";271/6229;;r/DanielaRus;190/3168", "google_scholar": ";UzM0rckAAAAJ;EWB-8-oAAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.at/citations?user=YarJF3QAAAAJ", "orcid": ";;;;0000-0002-9889-5222", "linkedin": "alexhquach/;mc8/;;;raminhasani/", "or_profile": "~Alex_Quach1;~Makram_Chahine1;~Alexander_Amini1;~Daniela_Rus1;~Ramin_M._Hasani1", "aff": "Liquid AI;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology;Massachusetts Institute of Technology", "aff_domain": "liquid.ai;mit.edu;mit.edu;mit.edu;mit.edu", "position": "Researcher;PhD student;PhD student;Full Professor;Researcher", "bibtex": "@inproceedings{\nquach2024gaussian,\ntitle={Gaussian Splatting to Real World Flight Navigation Transfer with Liquid Networks},\nauthor={Alex Quach and Makram Chahine and Alexander Amini and Ramin Hasani and Daniela Rus},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ubq7Co6Cbv}\n}", "github": "https://github.com/alexquach/multienv_sim", "project": "", "reviewers": "wnWZ;iDUo;5nTe", "site": "https://openreview.net/forum?id=ubq7Co6Cbv", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;4;4", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16680844417971115994&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1", "aff_unique_norm": "Liquid AI;Massachusetts Institute of Technology", "aff_unique_dep": ";", "aff_unique_url": ";https://web.mit.edu", "aff_unique_abbr": ";MIT", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Unknown;United States" }, { "id": "ueBmGhLOXP", "title": "EquiBot: SIM(3)-Equivariant Diffusion Policy for Generalizable and Data Efficient Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Building 
effective imitation learning methods that enable robots to learn from limited data and still generalize across diverse real-world environments is a long-standing problem in robot learning. We propose EquiBot, a robust, data-efficient, and generalizable approach for robot manipulation task learning. Our approach combines SIM(3)-equivariant neural network architectures with diffusion models. This ensures that our learned policies are invariant to changes in scale, rotation, and translation, enhancing their applicability to unseen environments while retaining the benefits of diffusion-based policy learning such as multi-modality and robustness. We show on a suite of 6 simulation tasks that our proposed method reduces the data requirements and improves generalization to novel scenarios. In the real world, with 10 variations of 6 mobile manipulation tasks, we show that our method can easily generalize to novel objects and scenes after learning from just 5 minutes of human demonstrations in each task.", "keywords": "Imitation Learning;Equivariance;Data Efficiency", "primary_area": "", "supplementary_material": "/attachment/34ae72727ed89e2d69fe2287a71bd4b8d99e1300.zip", "author": "Jingyun Yang;Ziang Cao;Congyue Deng;Rika Antonova;Shuran Song;Jeannette Bohg", "authorids": "~Jingyun_Yang1;~Ziang_Cao2;~Congyue_Deng1;~Rika_Antonova1;~Shuran_Song3;~Jeannette_Bohg1", "gender": "M;;F;;F;", "homepage": "https://yjy0625.github.io;;https://cs.stanford.edu/~congyue/;;https://shurans.github.io/;https://web.stanford.edu/~bohg/", "dblp": ";;267/5521;;;52/7377", "google_scholar": "7XBAa2QAAAAJ;;XJZ8UBcAAAAJ;;https://scholar.google.com/citations?hl=en;rjnJnEkAAAAJ", "orcid": ";;;;;0000-0002-4921-7193", "linkedin": ";;;;;", "or_profile": "~Jingyun_Yang1;~Ziang_Cao2;~Congyue_Deng1;~Rika_Antonova1;~Shuran_Song3;~Jeannette_Bohg1", "aff": "Stanford University;;Stanford University;;Stanford University;Stanford University", "aff_domain": "stanford.edu;;stanford.edu;;stanford.edu;stanford.edu", "position": "PhD student;;PhD student;;Assistant Professor;Assistant Professor", "bibtex": "@inproceedings{\nyang2024equibot,\ntitle={EquiBot: {SIM}(3)-Equivariant Diffusion Policy for Generalizable and Data Efficient Learning},\nauthor={Jingyun Yang and Ziang Cao and Congyue Deng and Rika Antonova and Shuran Song and Jeannette Bohg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ueBmGhLOXP}\n}", "github": "https://github.com/yjy0625/equibot", "project": "", "reviewers": "U9p1;vrP9;8ciL", "site": "https://openreview.net/forum?id=ueBmGhLOXP", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 38, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17968417828206246442&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United States" }, { "id": "vBj5oC60Lk", "title": "Lifelong Autonomous Improvement of Navigation Foundation Models in the Wild", "track": "main", "status": "Poster", "tldr": "", "abstract": "Recent works have proposed a number of general-purpose robotic foundation models 
that can control a variety of robotic platforms to perform a range of different tasks, including in the domains of navigation and manipulation. However, such models are typically trained via imitation learning, which precludes the ability to improve autonomously through experience that the robot gathers on the job. In this work, our aim is to train general-purpose robotic foundation models in the domain of robotic navigation specifically with the aim of enabling autonomous self-improvement. We show that a combination of pretraining with offline reinforcement learning and a complete system for continual autonomous operation leads to a robotic learning framework that not only starts off with broad and diverse capabilities, but can further improve and adapt those capabilities in the course of carrying out navigational tasks in a given deployment location. To our knowledge, our model LiReN is the first navigation robot foundation model that is capable of fine-tuning with autonomous online data in open-world settings.", "keywords": "Navigation;Reinforcement Learning;Lifelong Learning", "primary_area": "", "supplementary_material": "/attachment/61425bc3c502d7782e8969c2d767d1821fd17091.zip", "author": "Kyle Stachowicz;Lydia Ignatova;Sergey Levine", "authorids": "~Kyle_Stachowicz1;~Lydia_Ignatova1;~Sergey_Levine1", "gender": "M;F;M", "homepage": "https://kylesta.ch;;https://people.eecs.berkeley.edu/~svlevine/", "dblp": ";;80/7594", "google_scholar": ";;8R35rCwAAAAJ", "orcid": ";;", "linkedin": ";lydia-ignatova/;", "or_profile": "~Kyle_Stachowicz1;~Lydia_Ignatova1;~Sergey_Levine1", "aff": "University of California, Berkeley;University of California, Berkeley;Google", "aff_domain": "berkeley.edu;berkeley.edu;google.com", "position": "PhD student;Undergrad student;Research Scientist", "bibtex": "@inproceedings{\nstachowicz2024lifelong,\ntitle={Lifelong Autonomous Improvement of Navigation Foundation Models in the Wild},\nauthor={Kyle Stachowicz and Lydia Ignatova and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vBj5oC60Lk}\n}", "github": "https://github.com/kylestach/lifelong-nav-rl", "project": "", "reviewers": "BE4N;p4fP;4zSa", "site": "https://openreview.net/forum?id=vBj5oC60Lk", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;4", "rating_avg": 3.6666666666666665, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13987589246311389586&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;0;1", "aff_unique_norm": "University of California, Berkeley;Google", "aff_unique_dep": ";Google", "aff_unique_url": "https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "UC Berkeley;Google", "aff_campus_unique_index": "0;0;1", "aff_campus_unique": "Berkeley;Mountain View", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "vhGkyWgctu", "title": "Learning Decentralized Multi-Biped Control for Payload Transport", "track": "main", "status": "Poster", "tldr": "", "abstract": "Payload transport over flat terrain via multi-wheel robot carriers is well-understood, highly effective, and configurable. In this paper, our goal is to provide similar effectiveness and configurability for transport over rough terrain that is more suitable for legs rather than wheels. 
For this purpose, we consider multi-biped robot carriers, where wheels are replaced by multiple bipedal robots attached to the carrier. Our main contribution is to design a decentralized controller for such systems that can be effectively applied to varying numbers and configurations of rigidly attached bipedal robots without retraining. We present a reinforcement learning approach for training the controller in simulation that supports transfer to the real world. Our experiments in simulation provide quantitative metrics showing the effectiveness of the approach over a wide variety of simulated transport scenarios. In addition, we demonstrate the controller in the real-world for systems composed of two and three Cassie robots. To our knowledge, this is the first example of a scalable multi-biped payload transport system.", "keywords": "Multi-robot Transport;Bipedal locomotion;Reinforcement Learning", "primary_area": "", "supplementary_material": "/attachment/a32cf900eff0cad82f64697d90d185edb95ffb3a.zip", "author": "Bikram Pandit;Ashutosh Gupta;Mohitvishnu S. Gadde;Addison Johnson;Aayam Kumar Shrestha;Helei Duan;Jeremy Dao;Alan Fern", "authorids": "~Bikram_Pandit1;~Ashutosh_Gupta3;~Mohitvishnu_S._Gadde1;~Addison_Johnson1;~Aayam_Kumar_Shrestha1;~Helei_Duan1;~Jeremy_Dao1;~Alan_Fern1", "gender": "M;M;M;;M;;M;M", "homepage": "https://bikcrum.com/;https://ashutosh781.github.io/;https://mohitgadde.com/;;https://idigitopia.github.io;;https://sites.google.com/view/jeremydao/home;http://www.eecs.oregonstate.edu/~afern", "dblp": ";;;;276/7259;;;49/6764", "google_scholar": "bikram.pandit;ZEtR-yMAAAAJ;zrve6q8AAAAJ;;S1gU5FkAAAAJ;;;https://scholar.google.com.tw/citations?user=GaKxFrcAAAAJ", "orcid": "0009-0007-3601-6118;;;;;;;", "linkedin": "bikcrum/;ashutosh-gupta781;;;aayamshrestha/;;;", "or_profile": "~Bikram_Pandit1;~Ashutosh_Gupta3;~Mohitvishnu_S._Gadde1;~Addison_Johnson1;~Aayam_Kumar_Shrestha1;~Helei_Duan1;~Jeremy_Dao1;~Alan_Fern1", "aff": "Oregon State University;Oregon State University;Oregon State University;;Oregon State University;;Oregon State University;Oregon State University", "aff_domain": "oregonstate.edu;oregonstate.edu;oregonstate.edu;;oregonstate.edu;;oregonstate.edu;oregonstate.edu", "position": "MS student;PhD student;PhD student;;PhD student;;PhD student;Full Professor", "bibtex": "@inproceedings{\npandit2024learning,\ntitle={Learning Decentralized Multi-Biped Control for Payload Transport},\nauthor={Bikram Pandit and Ashutosh Gupta and Mohitvishnu S. 
Gadde and Addison Johnson and Aayam Kumar Shrestha and Helei Duan and Jeremy Dao and Alan Fern},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vhGkyWgctu}\n}", "github": "https://github.com/osudrl/roadrunner/tree/paper/decmbc", "project": "", "reviewers": "wdBL;vVS6;eduL", "site": "https://openreview.net/forum?id=vhGkyWgctu", "pdf_size": 0, "rating": "3;3;4", "confidence": "5;3;4", "rating_avg": 3.3333333333333335, "confidence_avg": 4.0, "replies_avg": 4, "authors#_avg": 8, "corr_rating_confidence": 0.0, "gs_citation": 4, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7368557812048692045&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "Oregon State University", "aff_unique_dep": "", "aff_unique_url": "https://oregonstate.edu", "aff_unique_abbr": "OSU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "vobaOY0qDl", "title": "Jacta: A Versatile Planner for Learning Dexterous and Whole-body Manipulation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robotic manipulation is challenging due to discontinuous dynamics, as well as high-dimensional state and action spaces. Data-driven approaches that succeed in manipulation tasks require large amounts of data and expert demonstrations, typically from humans. Existing planners are restricted to specific systems and often depend on specialized algorithms for using demonstrations. Therefore, we introduce a flexible motion planner tailored to dexterous and whole-body manipulation tasks. Our planner creates readily usable demonstrations for reinforcement learning algorithms, eliminating the need for additional training pipeline complexities. With this approach, we can efficiently learn policies for complex manipulation tasks, where traditional reinforcement learning alone only makes little progress. 
Furthermore, we demonstrate that learned policies are transferable to real robotic systems for solving complex dexterous manipulation tasks.\n\nProject website: https://jacta-manipulation.github.io/", "keywords": "Dexterous Manipulation Planning;Learning with Demonstrations", "primary_area": "", "supplementary_material": "/attachment/a231db1f927dc40ce83481bb02129e97da7f2f86.zip", "author": "Jan Bruedigam;Ali Adeeb Abbas;Maks Sorokin;Kuan Fang;Brandon Hung;Maya Guru;Stefan Georg Sosnowski;Jiuguang Wang;Sandra Hirche;Simon Le Cleac'h", "authorids": "~Jan_Bruedigam1;~Ali_Adeeb_Abbas1;~Maks_Sorokin1;~Kuan_Fang3;bhung@theaiinstitute.com;mguru@theaiinstitute.com;~Stefan_Georg_Sosnowski1;~Jiuguang_Wang1;~Sandra_Hirche1;~Simon_Le_Cleac'h1", "gender": "M;;;;;;;;F;", "homepage": "https://www.ce.cit.tum.de/itr/bruedigam/;;;;;;;;http://www.itr.ei.tum.de;", "dblp": ";;;;;;;;89/6985;", "google_scholar": ";;;;;;;;;", "orcid": ";;;;;;;;;", "linkedin": ";;;;;;;;;", "or_profile": "~Jan_Bruedigam1;~Ali_Adeeb_Abbas1;~Maks_Sorokin1;~Kuan_Fang3;bhung@theaiinstitute.com;mguru@theaiinstitute.com;~Stefan_Georg_Sosnowski1;~Jiuguang_Wang1;~Sandra_Hirche1;~Simon_Le_Cleac'h1", "aff": "Technische Universit\u00e4t M\u00fcnchen;;;;;;;;Technical University Munich;", "aff_domain": "tum.de;;;;;;;;tum.de;", "position": "PhD student;;;;;;;;Full Professor;", "bibtex": "@inproceedings{\nbruedigam2024jacta,\ntitle={Jacta: A Versatile Planner for Learning Dexterous and Whole-body Manipulation},\nauthor={Jan Bruedigam and Ali Adeeb Abbas and Maks Sorokin and Kuan Fang and Brandon Hung and Maya Guru and Stefan Georg Sosnowski and Jiuguang Wang and Sandra Hirche and Simon Le Cleac'h},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vobaOY0qDl}\n}", "github": "", "project": "", "reviewers": "tB8p;RBak;YCCA;feZG", "site": "https://openreview.net/forum?id=vobaOY0qDl", "pdf_size": 0, "rating": "2;3;3;3", "confidence": "4;5;4;3", "rating_avg": 2.75, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 10, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8266428274165200943&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1", "aff_unique_norm": "Technische Universit\u00e4t M\u00fcnchen;Technical University of Munich", "aff_unique_dep": ";", "aff_unique_url": "https://www.tum.de;https://www.tum.de", "aff_unique_abbr": "TUM;TUM", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Germany" }, { "id": "vtEn8NJWlz", "title": "Learning Robotic Manipulation Policies from Point Clouds with Conditional Flow Matching", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning from expert demonstrations is a popular approach to train\nrobotic manipulation policies from limited data. However, imitation learning\nalgorithms require a number of design choices ranging from the input modality,\ntraining objective, and 6-DoF end-effector pose representation. Diffusion-based\nmethods have gained popularity as they allow to predict long horizon trajectories\nand handle multimodal action distributions. Recently, Conditional Flow Matching\n(CFM) (or Rectified Flow) has been proposed as a more flexible generalization\nof diffusion models. 
In this paper we investigate the application of CFM in the\ncontext of robotic policy learning, and specifically study the interplay with the\nother design choices required to build an imitation learning algorithm. We show\nthat CFM gives the best performance when combined with point cloud input\nobservations. Additionally, we study the feasibility of a CFM formulation on\nthe SO(3) manifold and evaluate its suitability with a simplified example. We\nperform extensive experiments on RLBench which demonstrate that our proposed\nPointFlowMatch approach achieves a state-of-the-art average success rate of 67.8%\nover eight tasks, double the performance of the next best method.", "keywords": "Imitation Learning;Manipulation;Conditional Flow Matching", "primary_area": "", "supplementary_material": "", "author": "Eugenio Chisari;Nick Heppert;Max Argus;Tim Welschehold;Thomas Brox;Abhinav Valada", "authorids": "~Eugenio_Chisari1;~Nick_Heppert1;~Max_Argus2;~Tim_Welschehold1;~Thomas_Brox1;~Abhinav_Valada1", "gender": ";M;;M;M;M", "homepage": "https://chisarie.github.io/;https://rl.uni-freiburg.de/people/heppert;http://www2.informatik.uni-freiburg.de/~twelsche/;https://lmb.informatik.uni-freiburg.de/people/brox/index.en.html;https://rl.uni-freiburg.de/people/valada;https://lmb.informatik.uni-freiburg.de/people/argusm/", "dblp": ";319/9587;;97/4586;81/9531;192/2010", "google_scholar": "cKvbzWsAAAAJ;btkVpywAAAAJ;https://scholar.google.de/citations?hl=en;https://scholar.google.com/citations?hl=de;https://scholar.google.de/citations?user=LcARjz0AAAAJ;", "orcid": "0000-0001-7928-9377;;;0000-0002-6282-8861;0000-0003-4710-3114;0000-0002-1288-7476", "linkedin": ";nick-heppert/;;;avalada;max-argus-5810636/", "or_profile": "~Eugenio_Chisari1;~Nick_Heppert1;~Tim_Welschehold1;~Thomas_Brox1;~Abhinav_Valada1;~Max_Argus1", "aff": "Universit\u00e4t Freiburg;University of Freiburg, Albert-Ludwigs-Universit\u00e4t Freiburg;Universit\u00e4t Freiburg;University of Freiburg;University of Freiburg;University of Freiburg, Albert-Ludwigs-Universit\u00e4t Freiburg", "aff_domain": "uni-freiburg.de;cs.uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;uni-freiburg.de;cs.uni-freiburg.de", "position": "PhD student;PhD student;Postdoc;Full Professor;Full Professor;Postdoc", "bibtex": "@inproceedings{\nchisari2024learning,\ntitle={Learning Robotic Manipulation Policies from Point Clouds with Conditional Flow Matching},\nauthor={Eugenio Chisari and Nick Heppert and Max Argus and Tim Welschehold and Thomas Brox and Abhinav Valada},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=vtEn8NJWlz}\n}", "github": "", "project": "", "reviewers": "mSNh;s1fJ;E1uS", "site": "https://openreview.net/forum?id=vtEn8NJWlz", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;2;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 14, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10888291235901096707&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0;0;0", "aff_unique_norm": "University of Freiburg", "aff_unique_dep": "", "aff_unique_url": "https://www.uni-freiburg.de", "aff_unique_abbr": "Uni Freiburg", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Freiburg", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "Germany" }, { "id": "wD2kUVLT1g", "title": "Equivariant Diffusion Policy", "track": "main", "status": "Poster", "tldr": "", 
"abstract": "Recent work has shown diffusion models are an effective approach to learning the multimodal distributions arising from demonstration data in behavior cloning. However, a drawback of this approach is the need to learn a denoising function, which is significantly more complex than learning an explicit policy. In this work, we propose Equivariant Diffusion Policy, a novel diffusion policy learning method that leverages domain symmetries to obtain better sample efficiency and generalization in the denoising function. We theoretically analyze the $\\mathrm{SO}(2)$ symmetry of full 6-DoF control and characterize when a diffusion model is $\\mathrm{SO}(2)$-equivariant. We furthermore evaluate the method empirically on a set of 12 simulation tasks in MimicGen, and show that it obtains a success rate that is, on average, 21.9\\% higher than the baseline Diffusion Policy. We also evaluate the method on a real-world system to show that effective policies can be learned with relatively few training samples, whereas the baseline Diffusion Policy cannot.", "keywords": "Equivariance;Diffusion Model;Robotic Manipulation", "primary_area": "", "supplementary_material": "/attachment/e6ffe9e59c9704d5270ab60801113a687cb1dc7f.zip", "author": "Dian Wang;Stephen Hart;David Surovik;Tarik Kelestemur;Haojie Huang;Haibo Zhao;Mark Yeatman;Jiuguang Wang;Robin Walters;Robert Platt", "authorids": "~Dian_Wang1;~Stephen_Hart3;~David_Surovik1;~Tarik_Kelestemur1;~Haojie_Huang1;~Haibo_Zhao2;myeatman@theaiinstitute.com;~Jiuguang_Wang1;~Robin_Walters1;~Robert_Platt1", "gender": "M;M;M;;M;M;;;M;", "homepage": "https://pointw.github.io/;;;https://kelestemur.com/;https://haojhuang.github.io/;https://haiboz.sites.northeastern.edu/;;;http://www.robinwalters.com;http://www.ccs.neu.edu/home/rplatt/", "dblp": "191/1369-1;;;;144/2195;;;;258/3416;39/5434", "google_scholar": "CckjtfQAAAAJ;;001dgZcAAAAJ;;https://scholar.google.com/citations?hl=zh-CN;;;;fnprJmUAAAAJ;Z4Y5S2oAAAAJ", "orcid": ";;;;;;;;;", "linkedin": "dianwang1007;stephen-hart-3711666/;;;;;;;;", "or_profile": "~Dian_Wang1;~Stephen_Hart3;~David_Surovik1;~Tarik_Kelestemur1;~Haojie_Huang1;~Haibo_Zhao2;myeatman@theaiinstitute.com;~Jiuguang_Wang1;~Robin_Walters1;~Robert_Platt1", "aff": "Northeastern University;The Robotics & AI Institute;The AI Institute ;Boston Dynamics AI Institute;Northeastern University;Northeastern University;;;Northeastern University ;Northeastern University", "aff_domain": "northeastern.edu;theaiinstitute.com;theaiinstitute.com;theaiinstitute.com;northeastern.edu;northeastern.edu;;;northeastern.edu;neu.edu", "position": "PhD student;Robotics Researcher;Researcher;Researcher;PhD student;MS student;;;Assistant Professor;Associate Professor", "bibtex": "@inproceedings{\nwang2024equivariant,\ntitle={Equivariant Diffusion Policy},\nauthor={Dian Wang and Stephen Hart and David Surovik and Tarik Kelestemur and Haojie Huang and Haibo Zhao and Mark Yeatman and Jiuguang Wang and Robin Walters and Robert Platt},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wD2kUVLT1g}\n}", "github": "", "project": "", "reviewers": "1QLT;TQso;T42w", "site": "https://openreview.net/forum?id=wD2kUVLT1g", "pdf_size": 0, "rating": "3;4;4", "confidence": "4;4;2", "rating_avg": 3.6666666666666665, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 10, "corr_rating_confidence": -0.5, "gs_citation": 26, "gs_cited_by_link": 
"https://scholar.google.com/scholar?cites=12279953908357781107&as_sdt=5,44&sciodt=0,44&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;2;3;0;0;0;0", "aff_unique_norm": "Northeastern University;Robotics & AI Institute;AI Institute;Boston Dynamics AI Institute", "aff_unique_dep": ";;;AI Institute", "aff_unique_url": "https://www.northeastern.edu;;;https://www.bostondynamics.com/", "aff_unique_abbr": "NEU;;;BD AI", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0", "aff_country_unique": "United States;" }, { "id": "wH7Wv0nAm8", "title": "Bi-Level Motion Imitation for Humanoid Robots", "track": "main", "status": "Poster", "tldr": "", "abstract": "Imitation learning from human motion capture (MoCap) data provides a promising way to train humanoid robots. However, due to differences in morphology, such as varying degrees of joint freedom and force limits, exact replication of human behaviors may not be feasible for humanoid robots. Consequently, incorporating physically infeasible MoCap data in training datasets can adversely affect the performance of the robot policy. To address this issue, we propose a bi-level optimization-based imitation learning framework that alternates between optimizing both the robot policy and the target MoCap data. Specifically, we first develop a generative latent dynamics model using a novel self-consistent auto-encoder, which learns sparse and structured motion representations while capturing desired motion patterns in the dataset. The dynamics model is then utilized to generate reference motions while the latent representation regularizes the bi-level motion imitation process. Simulations conducted with a realistic model of a humanoid robot demonstrate that our method enhances the robot policy by modifying reference motions to be physically consistent.", "keywords": "Humanoid Robots;Imitation Learning;Latent Dynamics Model", "primary_area": "", "supplementary_material": "/attachment/8051ce1a305a792b766a2b296ac5ab41f3f1a7f8.zip", "author": "Wenshuai Zhao;Yi Zhao;Joni Pajarinen;Michael Muehlebach", "authorids": "~Wenshuai_Zhao1;~Yi_Zhao6;~Joni_Pajarinen2;~Michael_Muehlebach1", "gender": "M;M;;", "homepage": "https://wenshuaizhao.github.io/;https://zhaoyi11.github.io/;;https://sites.google.com/view/mmuehlebach/", "dblp": "246/5109;51/4138-1;23/8355;142/1129", "google_scholar": "cuNOys8AAAAJ;https://scholar.google.com/citations?hl=en;https://scholar.google.fi/citations?user=-2fJStwAAAAJ;uTfYBAsAAAAJ", "orcid": ";0009-0002-9979-595X;0000-0003-4469-8191;", "linkedin": ";;;", "or_profile": "~Wenshuai_Zhao1;~Yi_Zhao6;~Joni_Pajarinen2;~Michael_Muehlebach1", "aff": "Aalto University;Max Planck Institute for Intelligent Systems;Aalto University;Max-Planck Institute", "aff_domain": "aalto.fi;mpg.tuebingen.de;aalto.fi;mpg.de", "position": "PhD student;Intern;Assistant Professor;Principal Researcher", "bibtex": "@inproceedings{\nzhao2024bilevel,\ntitle={Bi-Level Motion Imitation for Humanoid Robots},\nauthor={Wenshuai Zhao and Yi Zhao and Joni Pajarinen and Michael Muehlebach},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wH7Wv0nAm8}\n}", "github": "", "project": "", "reviewers": "p8L1;k7pZ;y7N1", "site": "https://openreview.net/forum?id=wH7Wv0nAm8", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;4", "rating_avg": 3.0, "confidence_avg": 3.3333333333333335, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 2, 
"gs_cited_by_link": "https://scholar.google.com/scholar?cites=9314035517620644868&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;0;2", "aff_unique_norm": "Aalto University;Max Planck Institute for Intelligent Systems;Max-Planck-Gesellschaft zur F\u00f6rderung der Wissenschaften e.V.", "aff_unique_dep": ";Intelligent Systems;", "aff_unique_url": "https://www.aalto.fi;https://www.mpi-is.mpg.de;https://www.mpg.de", "aff_unique_abbr": "Aalto;MPI-IS;MPG", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;0;1", "aff_country_unique": "Finland;Germany" }, { "id": "wSWMsjuMTI", "title": "ManiWAV: Learning Robot Manipulation from In-the-Wild Audio-Visual Data", "track": "main", "status": "Poster", "tldr": "", "abstract": "Audio signals provide rich information for the robot interaction and object properties through contact. These information can surprisingly ease the learning of contact-rich robot manipulation skills, especially when the visual information alone is ambiguous or incomplete. However, the usage of audio data in robot manipulation has been constrained to teleoperated demonstrations collected by either attaching a microphone to the robot or object, which significantly limits its usage in robot learning pipelines. In this work, we introduce ManiWAV: an 'ear-in-hand' data collection device to collect in-the-wild human demonstrations with synchronous audio and visual feedback, and a corresponding policy interface to learn robot manipulation policy directly from the demonstrations. We demonstrate the capabilities of our system through four contact-rich manipulation tasks that require either passively sensing the contact events and modes, or actively sensing the object surface materials and states. In addition, we show that our system can generalize to unseen in-the-wild environments, by learning from diverse in-the-wild human demonstrations. 
All data, code, and policy will be public.", "keywords": "Robot Manipulation;Imitation Learning;Audio", "primary_area": "", "supplementary_material": "/attachment/90b20669b0344cdc27bda78be5ae4aad3cdba626.zip", "author": "Zeyi Liu;Cheng Chi;Eric Cousineau;Naveen Kuppuswamy;Benjamin Burchfiel;Shuran Song", "authorids": "~Zeyi_Liu1;~Cheng_Chi4;~Eric_Cousineau1;~Naveen_Kuppuswamy1;~Benjamin_Burchfiel1;~Shuran_Song3", "gender": "F;M;M;M;M;F", "homepage": "https://lzylucy.github.io;https://cheng-chi.github.io/;https://eacousineau.com/;https://naveenoid.wordpress.com/;http://www.benburchfiel.com/;https://shurans.github.io/", "dblp": ";;;;136/9247;", "google_scholar": ";EO0PHdAAAAAJ;LOTPw48AAAAJ;VIGQK8wAAAAJ;eGoTK1YAAAAJ;https://scholar.google.com/citations?hl=en", "orcid": ";;;;;", "linkedin": "zeyi-liu;;eacousineau;;benburchfiel/;", "or_profile": "~Zeyi_Liu1;~Cheng_Chi4;~Eric_Cousineau1;~Naveen_Kuppuswamy1;~Benjamin_Burchfiel1;~Shuran_Song3", "aff": "Stanford University;Stanford University;Toyota Research Institute;Toyota Research Institute;Dexterous Manipulation Group, Toyota Research Institute;Stanford University", "aff_domain": "stanford.edu;stanford.edu;tri.global;tri.global;tri.global;stanford.edu", "position": "PhD student;PhD student;Researcher;Assistant Professor;Researcher;Assistant Professor", "bibtex": "@inproceedings{\nliu2024maniwav,\ntitle={Mani{WAV}: Learning Robot Manipulation from In-the-Wild Audio-Visual Data},\nauthor={Zeyi Liu and Cheng Chi and Eric Cousineau and Naveen Kuppuswamy and Benjamin Burchfiel and Shuran Song},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wSWMsjuMTI}\n}", "github": "https://github.com/real-stanford/maniwav", "project": "", "reviewers": "7TLA;AFoB;cdf4", "site": "https://openreview.net/forum?id=wSWMsjuMTI", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;5", "rating_avg": 3.0, "confidence_avg": 4.333333333333333, "replies_avg": 5, "authors#_avg": 6, "corr_rating_confidence": 0.0, "gs_citation": 24, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7032372995606756233&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;1;1;1;0", "aff_unique_norm": "Stanford University;Toyota Research Institute", "aff_unique_dep": ";", "aff_unique_url": "https://www.stanford.edu;https://www.tri.global", "aff_unique_abbr": "Stanford;TRI", "aff_campus_unique_index": "0;0;0", "aff_campus_unique": "Stanford;", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "wTKJge0PTq", "title": "HiRT: Enhancing Robotic Control with Hierarchical Robot Transformers", "track": "main", "status": "Poster", "tldr": "", "abstract": "Large Vision-Language-Action (VLA) models, leveraging powerful pre-trained Vision-Language Models (VLMs) backends, have shown promise in robotic control due to their impressive generalization ability. However, the success comes at a cost. Their reliance on VLM backends with billions of parameters leads to high computational costs and inference latency, limiting the testing scenarios to mainly quasi-static tasks and hindering performance in dynamic tasks requiring rapid interactions. To address these limitations, this paper proposes \\textbf{HiRT}, a \\textbf{Hi}erarchical \\textbf{R}obot \\textbf{T}ransformer framework that enables flexible frequency and performance trade-off. 
HiRT keeps VLMs running at low frequencies to capture temporarily invariant features while enabling real-time interaction through a high-frequency vision-based policy guided by the slowly updated features. Experiment results in both simulation and real-world settings demonstrate significant improvements over baseline methods. Empirically, we achieve a 58\\% reduction in inference time delay while maintaining comparable success rates. Additionally, on novel dynamic manipulation benchmarks which are challenging for previous VLA models, HiRT improves the success rate from 48% to 75%.", "keywords": "Imitation Learning;Robots;Vision Language Models", "primary_area": "", "supplementary_material": "/attachment/0297ccb815516249c73b927a9208ee77839035d7.zip", "author": "Jianke Zhang;Yanjiang Guo;Xiaoyu Chen;Yen-Jen Wang;Yucheng Hu;Chengming Shi;Jianyu Chen", "authorids": "~Jianke_Zhang1;~Yanjiang_Guo1;~Xiaoyu_Chen4;~Yen-Jen_Wang1;~Yucheng_Hu1;~Chengming_Shi1;~Jianyu_Chen1", "gender": "M;M;;M;M;M;M", "homepage": ";https://robert-gyj.github.io/;https://github.com/Cospui;https://wangyenjen.github.io;https://github.com/Hu-Yuch;;http://people.iiis.tsinghua.edu.cn/~jychen/", "dblp": ";;;164/2206;;;", "google_scholar": "https://scholar.google.com/citations?hl=zh-CN;rBeZZPMAAAAJ;;_U-HwfkAAAAJ;;;", "orcid": ";;;;;;", "linkedin": ";;;wangyenjen/;;http://www.linkedin.com/in/chengming-ruby-shi;", "or_profile": "~Jianke_Zhang1;~Yanjiang_Guo1;~Xiaoyu_Chen4;~Yen-Jen_Wang1;~Yucheng_Hu1;~Chengming_Shi1;~Jianyu_Chen1", "aff": "Beijing Institute of Technology;Tsinghua University;Tsinghua University;Tsinghua University;Tsinghua University;;Tsinghua University", "aff_domain": "bit.edu.cn;mails.tsinghua.edu.cn;tsinghua.edu.cn;mails.tsinghua.edu.cn;mail.tsinghua.edu.cn;;tsinghua.edu.cn", "position": "Undergrad student;PhD student;Graduate student;MS student;Undergrad student;;Assistant Professor", "bibtex": "@inproceedings{\nzhang2024hirt,\ntitle={Hi{RT}: Enhancing Robotic Control with Hierarchical Robot Transformers},\nauthor={Jianke Zhang and Yanjiang Guo and Xiaoyu Chen and Yen-Jen Wang and Yucheng Hu and Chengming Shi and Jianyu Chen},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wTKJge0PTq}\n}", "github": "", "project": "", "reviewers": "FEy5;4RK9;vpoa", "site": "https://openreview.net/forum?id=wTKJge0PTq", "pdf_size": 0, "rating": "2;3;3", "confidence": "5;5;4", "rating_avg": 2.6666666666666665, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": -0.4999999999999999, "gs_citation": 8, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8701447161649298373&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;1;1", "aff_unique_norm": "Beijing Institute of Technology;Tsinghua University", "aff_unique_dep": ";", "aff_unique_url": "http://www.bit.edu.cn/;https://www.tsinghua.edu.cn", "aff_unique_abbr": "BIT;THU", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0", "aff_country_unique": "China" }, { "id": "wcbrhPnOei", "title": "RobotKeyframing: Learning Locomotion with High-Level Objectives via Mixture of Dense and Sparse Rewards", "track": "main", "status": "Poster", "tldr": "", "abstract": "This paper presents a novel learning-based control framework that uses keyframing to incorporate high-level objectives in natural locomotion for legged robots. 
These high-level objectives are specified as a variable number of partial or complete pose targets that are spaced arbitrarily in time. Our proposed framework utilizes a multi-critic reinforcement learning algorithm to effectively handle the mixture of dense and sparse rewards. Additionally, it employs a transformer-based encoder to accommodate a variable number of input targets, each associated with specific time-to-arrivals. Throughout simulation and hardware experiments, we demonstrate that our framework can effectively satisfy the target keyframe sequence at the required times. In the experiments, the multi-critic method significantly reduces the effort of hyperparameter tuning compared to the standard single-critic alternative. Moreover, the proposed transformer-based architecture enables robots to anticipate future goals, which results in quantitative improvements in their ability to reach their targets.", "keywords": "Legged robots;Multi-Critic Reinforcement Learning;Motion Imitation", "primary_area": "", "supplementary_material": "/attachment/699f68a71dac78f35e395ae87229c704c0a61e98.zip", "author": "Fatemeh Zargarbashi;Jin Cheng;Dongho Kang;Robert Sumner;Stelian Coros", "authorids": "~Fatemeh_Zargarbashi1;~Jin_Cheng1;~Dongho_Kang1;~Robert_Sumner1;~Stelian_Coros1", "gender": "F;M;M;M;M", "homepage": ";https://jin-cheng.me/;https://donghok.me/;https://studios.disneyresearch.com/people/bob-sumner/;http://crl.ethz.ch/index.html", "dblp": ";;;;", "google_scholar": "zCnF2uIAAAAJ;jHsJrX8AAAAJ;east0822;;sX31JjwAAAAJ", "orcid": "0009-0001-9734-2693;0000-0001-9822-3701;;;", "linkedin": ";jin-cheng-886462163/;kangdongho/;;", "or_profile": "~Fatemeh_Zargarbashi1;~Jin_Cheng1;~Dongho_Kang1;~Robert_Sumner1;~Stelian_Coros1", "aff": "Disney Research|Studios;ETHZ - ETH Zurich;ETHZ - ETH Zurich;Disney Research, Disney Research;ETHZ - ETH Zurich", "aff_domain": "disneyresearch.com;ethz.ch;ethz.ch;disneyresearch.com;ethz.ch", "position": "PhD student;PhD student;PhD student;Principal Researcher;Associate Professor", "bibtex": "@inproceedings{\nzargarbashi2024robotkeyframing,\ntitle={RobotKeyframing: Learning Locomotion with High-Level Objectives via Mixture of Dense and Sparse Rewards},\nauthor={Fatemeh Zargarbashi and Jin Cheng and Dongho Kang and Robert Sumner and Stelian Coros},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=wcbrhPnOei}\n}", "github": "", "project": "", "reviewers": "fZ8e;i8Sh;aL5k", "site": "https://openreview.net/forum?id=wcbrhPnOei", "pdf_size": 0, "rating": "2;3;4", "confidence": "5;5;4", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": -0.8660254037844385, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=13061473486680887560&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;1;0;1", "aff_unique_norm": "Disney Research;ETH Zurich", "aff_unique_dep": "Research;", "aff_unique_url": "https://research.disney.com;https://www.ethz.ch", "aff_unique_abbr": "Disney Research;ETHZ", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;1;1;0;1", "aff_country_unique": "United States;Switzerland" }, { "id": "xYJn2e1uu8", "title": "Sparsh: Self-supervised touch representations for vision-based tactile sensing", "track": "main", "status": "Poster", "tldr": "", "abstract": "In this work, we introduce general purpose touch representations for the increasingly accessible class of 
vision-based tactile sensors. Such sensors have led to many recent advances in robot manipulation as they markedly complement vision, yet solutions today often rely on task and sensor specific handcrafted perception models. Collecting real data at scale with task centric ground truth labels, like contact forces and slip, is a challenge further compounded by sensors of various form factor differing in aspects like lighting and gel markings. To tackle this, we turn to self-supervised learning (SSL) that has demonstrated remarkable performance in computer vision. We present Sparsh, a family of SSL models that can support various vision-based tactile sensors, alleviating the need for custom labels through pre-training on 460k+ tactile images with masking and self-distillation in pixel and latent spaces. We also build TacBench, to facilitate standardized benchmarking across sensors and models, comprising of six tasks ranging from comprehending tactile properties to enabling physical perception and manipulation planning. In evaluations, we find that SSL pre-training for touch representation outperforms task and sensor-specific end-to-end training by 95.1% on average over TacBench, and Sparsh (DINO) and Sparsh (IJEPA) are the most competitive, indicating the merits of learning in latent space for tactile images. Project page: https://sparsh-ssl.github.io", "keywords": "Tactile sensing;Pre-trained representations;Self-supervised learning", "primary_area": "", "supplementary_material": "/attachment/41861ffc2064440bab92607fd7b5bfcbaa773504.zip", "author": "Carolina Higuera;Akash Sharma;Chaithanya Krishna Bodduluri;Taosha Fan;Patrick Lancaster;Mrinal Kalakrishnan;Michael Kaess;Byron Boots;Mike Lambeta;Tingfan Wu;Mustafa Mukadam", "authorids": "~Carolina_Higuera1;~Akash_Sharma1;~Chaithanya_Krishna_Bodduluri1;~Taosha_Fan1;~Patrick_Lancaster1;~Mrinal_Kalakrishnan1;~Michael_Kaess1;~Byron_Boots1;~Mike_Lambeta1;~Tingfan_Wu2;~Mustafa_Mukadam1", "gender": "F;M;M;;M;M;M;;M;M;M", "homepage": ";https://akashsharma02.github.io;;https://github.com/fantaosha;https://palanc.github.io;;https://www.cs.cmu.edu/~kaess/;;;;http://www.mustafamukadam.com", "dblp": ";;;;;46/4195;26/6036;;;;", "google_scholar": "https://scholar.google.es/citations?hl=es;LhKc2CsAAAAJ;;;e9MgnYYAAAAJ;DMTuJzAAAAAJ;27eupmsAAAAJ;;;https://scholar.google.com/citations?hl=en;yYpm9LoAAAAJ", "orcid": "0000-0001-5141-0817;;;;;;0000-0002-7590-3357;;;;", "linkedin": ";;krishna-bck;;;mrinalkalakrishnan/;michaelkaess/;;mike-maroje-lambeta;;mhmukadam/", "or_profile": "~Carolina_Higuera1;~Akash_Sharma1;~Chaithanya_Krishna_Bodduluri1;~Taosha_Fan1;~Patrick_Lancaster1;~Mrinal_Kalakrishnan1;~Michael_Kaess1;~Byron_Boots1;~Mike_Lambeta1;~Tingfan_Wu2;~Mustafa_Mukadam1", "aff": "University of Washington;Carnegie Mellon University;Meta Facebook;;Meta;Meta;Carnegie Mellon University;;Meta;;Meta AI", "aff_domain": "uw.edu;cs.cmu.edu;meta.com;;meta.com;meta.com;cmu.edu;;meta.com;;meta.com", "position": "PhD student;PhD student;Researcher;;Postdoc;Researcher;Associate Professor;;Engineer;;Researcher", "bibtex": "@inproceedings{\nhiguera2024sparsh,\ntitle={Sparsh: Self-supervised touch representations for vision-based tactile sensing},\nauthor={Carolina Higuera and Akash Sharma and Chaithanya Krishna Bodduluri and Taosha Fan and Patrick Lancaster and Mrinal Kalakrishnan and Michael Kaess and Byron Boots and Mike Lambeta and Tingfan Wu and Mustafa Mukadam},\nbooktitle={8th Annual Conference on Robot 
Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xYJn2e1uu8}\n}", "github": "https://github.com/facebookresearch/sparsh", "project": "", "reviewers": "B6vw;EZmQ;brZ3", "site": "https://openreview.net/forum?id=xYJn2e1uu8", "pdf_size": 0, "rating": "3;3;3", "confidence": "5;4;5", "rating_avg": 3.0, "confidence_avg": 4.666666666666667, "replies_avg": 5, "authors#_avg": 11, "corr_rating_confidence": 0.0, "gs_citation": 10, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=9014749620569693014&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;2;2;1;2;2", "aff_unique_norm": "University of Washington;Carnegie Mellon University;Meta", "aff_unique_dep": ";;Meta Platforms, Inc.", "aff_unique_url": "https://www.washington.edu;https://www.cmu.edu;https://meta.com", "aff_unique_abbr": "UW;CMU;Meta", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "xYleTh2QhS", "title": "Adaptive Diffusion Terrain Generator for Autonomous Uneven Terrain Navigation", "track": "main", "status": "Poster", "tldr": "", "abstract": "Model-free reinforcement learning has emerged as a powerful method for developing robust robot control policies capable of navigating through complex and unstructured terrains.\nThe effectiveness of these methods hinges on two essential elements: \n(1) the use of massively parallel physics simulations to expedite policy training, \nand\n(2) the deployment of an environment generator tasked with crafting terrains that are sufficiently challenging yet attainable, thereby facilitating continuous policy improvement. \nExisting methods of environment generation often rely on heuristics constrained by a set of parameters, limiting the diversity and realism.\nIn this work, we introduce the Adaptive Diffusion Terrain Generator (ADTG), a novel method that leverages Denoising Diffusion Probabilistic Models (DDPMs) to dynamically expand an existing training environment by adding more diverse and complex terrains tailored to the current policy.\nUnlike conventional methods, ADTG adapts the terrain complexity and variety based on the evolving capabilities of the current policy.\nThis is achieved through two primary mechanisms:\nFirst, by blending terrains from the initial dataset within their latent spaces using performance-informed weights, ADTG creates terrains that suitably challenge the policy. 
\nSecondly, by manipulating the initial noise in the diffusion process, ADTG seamlessly shifts between creating similar terrains for fine-tuning the current policy and entirely novel ones for expanding training diversity.\nOur experiments show that the policy trained by ADTG outperforms both procedural generated and natural environments, along with popular navigation methods.", "keywords": "Curriculum Reinforcement Learning;Diffusion Model;Field Robotics", "primary_area": "", "supplementary_material": "/attachment/afd2a43c61d2c994c167cf282f758b98b0246643.zip", "author": "Youwei Yu;Junhong Xu;Lantao Liu", "authorids": "~Youwei_Yu1;~Junhong_Xu1;~Lantao_Liu1", "gender": "M;M;Not Specified", "homepage": "https://youwei-yu.com;https://junhongxu.github.io/;", "dblp": ";;", "google_scholar": "4U0U9fQAAAAJ;;L5dHk5cAAAAJ", "orcid": ";;", "linkedin": "youwei-fisher/;;", "or_profile": "~Youwei_Yu1;~Junhong_Xu1;~Lantao_Liu1", "aff": "Indiana University;Indiana University, Bloomington;Indiana University, Bloomington", "aff_domain": "iu.edu;iu.edu;iu.edu", "position": "PhD student;PhD student;Assistant Professor", "bibtex": "@inproceedings{\nyu2024adaptive,\ntitle={Adaptive Diffusion Terrain Generator for Autonomous Uneven Terrain Navigation},\nauthor={Youwei Yu and Junhong Xu and Lantao Liu},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xYleTh2QhS}\n}", "github": "", "project": "", "reviewers": "NMmZ;rh2f;RP3g;1ULr", "site": "https://openreview.net/forum?id=xYleTh2QhS", "pdf_size": 0, "rating": "2;2;3;4", "confidence": "5;3;4;4", "rating_avg": 2.75, "confidence_avg": 4.0, "replies_avg": 6, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14704761638931225827&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;0;0", "aff_unique_norm": "Indiana University", "aff_unique_dep": "", "aff_unique_url": "https://www.indiana.edu", "aff_unique_abbr": "IU", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Bloomington", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "xcBH8Jhmbi", "title": "Discovering Robotic Interaction Modes with Discrete Representation Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Abstract: Human actions manipulating articulated objects, such as opening and closing a drawer, can be categorized into multiple modalities we define as interaction modes. Traditional robot learning approaches lack discrete representations of these modes, which are crucial for empirical sampling and grounding. In this paper, we present ActAIM2, which learns a discrete representation of robot manipulation interaction modes in a purely unsupervised fashion, without the use of expert labels or simulator-based privileged information. Utilizing novel data collection methods involving simulator rollouts, ActAIM2 consists of an interaction mode selector and a low-level action predictor. The selector generates discrete representations of potential interaction modes with self-supervision, while the predictor outputs\ncorresponding action trajectories. Our method is validated through its success rate in manipulating articulated objects and its robustness in sampling meaningful actions from the discrete representation. Extensive experiments demonstrate ActAIM2\u2019s effectiveness in enhancing manipulability and generalizability over baselines and ablation studies. 
For videos and additional results, see our website: https://actaim2.github.io/.", "keywords": "Discovering Robotic Interaction Modes with Discrete Representation Learning", "primary_area": "", "supplementary_material": "/attachment/74dc322dfb8f588a91b7d18f0b1e68c3c03b1ac2.zip", "author": "Liquan Wang;Ankit Goyal;Haoping Xu;Animesh Garg", "authorids": "~Liquan_Wang2;~Ankit_Goyal1;~Haoping_Xu1;~Animesh_Garg1", "gender": "M;M;M;M", "homepage": "https://www.linkedin.com/in/liquan-wang-a37634196/;http://imankgoyal.github.io/;;http://animesh.garg.tech", "dblp": ";89/10051-1;;123/5728", "google_scholar": ";RhN6jKIAAAAJ;9mD-LUMAAAAJ;zp8V7ZMAAAAJ", "orcid": ";;;0000-0003-0482-4296", "linkedin": ";;;animeshgarg/", "or_profile": "~Liquan_Wang2;~Ankit_Goyal1;~Haoping_Xu1;~Animesh_Garg1", "aff": "Department of Computer Science;NVIDIA;Toronto University;NVIDIA", "aff_domain": "cs.toronto.edu;nvidia.com;utoronto.ca;nvidia.com", "position": "PhD student;Researcher;PhD student;Researcher", "bibtex": "@inproceedings{\nwang2024discovering,\ntitle={Discovering Robotic Interaction Modes with Discrete Representation Learning},\nauthor={Liquan Wang and Ankit Goyal and Haoping Xu and Animesh Garg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xcBH8Jhmbi}\n}", "github": "https://github.com/pairlab/ActAIM.git", "project": "", "reviewers": "DsEP;ituT;BZdG", "site": "https://openreview.net/forum?id=xcBH8Jhmbi", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 1, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=8435615671339628282&as_sdt=400005&sciodt=0,14&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;1", "aff_unique_norm": "Unknown Institution;NVIDIA;University of Toronto", "aff_unique_dep": "Department of Computer Science;NVIDIA Corporation;", "aff_unique_url": ";https://www.nvidia.com;https://www.utoronto.ca", "aff_unique_abbr": ";NVIDIA;U of T", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "1;2;1", "aff_country_unique": ";United States;Canada" }, { "id": "xeFKtSXPMd", "title": "OCCAM: Online Continuous Controller Adaptation with Meta-Learned Models", "track": "main", "status": "Poster", "tldr": "", "abstract": "Control tuning and adaptation present a significant challenge to the usage of robots in diverse environments. It is often nontrivial to find a single set of control parameters by hand that work well across the broad array of environments and conditions that a robot might encounter. Automated adaptation approaches must utilize prior knowledge about the system while adapting to significant domain shifts to find new control parameters quickly. In this work, we present a general framework for online controller adaptation that deals with these challenges. We combine meta-learning with Bayesian recursive estimation to learn prior predictive models of system performance that quickly adapt to online data, even when there is significant domain shift. These predictive models can be used as cost functions within efficient sampling-based optimization routines to find new control parameters online that maximize system performance. 
Our framework is powerful and flexible enough to adapt controllers for four diverse systems: a simulated race car, a simulated quadrupedal robot, and a simulated and physical quadrotor.", "keywords": "Controller Adaptation;Robot Model Learning;Meta-Learning", "primary_area": "", "supplementary_material": "/attachment/dc60e7428b40010cc1050acb1e70188ff827096e.zip", "author": "Hersh Sanghvi;Spencer Folk;Camillo Jose Taylor", "authorids": "~Hersh_Sanghvi1;~Spencer_Folk1;~Camillo_Jose_Taylor2", "gender": "M;Not Specified;M", "homepage": "https://hersh500.github.io/;;https://www.cis.upenn.edu/~cjtaylor/", "dblp": "309/6036;;t/CamilloJTaylor.html", "google_scholar": "kewZ28IAAAAJ;gQTeydkAAAAJ;r50jBCUAAAAJ", "orcid": ";0009-0001-9646-2415;", "linkedin": ";spencer-folk/;", "or_profile": "~Hersh_Sanghvi1;~Spencer_Folk1;~Camillo_Jose_Taylor2", "aff": "University of Pennsylvania;University of Pennsylvania;University of Pennsylvania", "aff_domain": "seas.upenn.edu;seas.upenn.edu;upenn.edu", "position": "PhD student;PhD student;Full Professor", "bibtex": "@inproceedings{\nsanghvi2024occam,\ntitle={{OCCAM}: Online Continuous Controller Adaptation with Meta-Learned Models},\nauthor={Hersh Sanghvi and Spencer Folk and Camillo Jose Taylor},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=xeFKtSXPMd}\n}", "github": "", "project": "", "reviewers": "vgNh;27Pw;vgNE", "site": "https://openreview.net/forum?id=xeFKtSXPMd", "pdf_size": 0, "rating": "4;4;4", "confidence": "5;3;4", "rating_avg": 4.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 2, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=15203775767195532351&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0", "aff_unique_norm": "University of Pennsylvania", "aff_unique_dep": "", "aff_unique_url": "https://www.upenn.edu", "aff_unique_abbr": "UPenn", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States" }, { "id": "y8XkuQIrvI", "title": "MILES: Making Imitation Learning Easy with Self-Supervision", "track": "main", "status": "Poster", "tldr": "", "abstract": "Data collection in imitation learning often requires significant, laborious human supervision, such as numerous demonstrations, and/or frequent environment resets for methods that incorporate reinforcement learning. In this work, we propose an alternative approach, MILES: a fully autonomous, self-supervised data collection paradigm, and we show that this enables efficient policy learning from just a single demonstration and a single environment reset. MILES autonomously learns a policy for returning to and then following the single demonstration, whilst being self-guided during data collection, eliminating the need for additional human interventions. We evaluated MILES across several realworld tasks, including tasks that require precise contact-rich manipulation such as locking a lock with a key. We found that, under the constraints of a single demonstration and no repeated environment resetting, MILES significantly outperforms state-of-the-art alternatives like imitation learning methods that leverage reinforcement learning. 
Videos of our experiments and code can be found on our webpage: www.robot-learning.uk/miles.", "keywords": "Imitation Learning;Robotic Manipulation;Self-Supervised Data Collection", "primary_area": "", "supplementary_material": "/attachment/e51da0efd5963d23dfbe28f0a79566121c62c8fb.zip", "author": "Georgios Papagiannis;Edward Johns", "authorids": "~Georgios_Papagiannis1;~Edward_Johns1", "gender": "M;", "homepage": "https://www.robot-learning.uk;", "dblp": "68/9968;", "google_scholar": "https://scholar.google.co.uk/citations?user=sMIUkiQAAAAJ;", "orcid": "0000-0002-8914-8786;", "linkedin": "https://uk.linkedin.com/in/edward-johns-1b24845a;george-papagiannis-85b91011a", "or_profile": "~Edward_Johns1;~George_Papagiannis1", "aff": "Imperial College London;Imperial College London", "aff_domain": "imperial.ac.uk;ic.ac.uk", "position": "Associate Professor;PhD student", "bibtex": "@inproceedings{\npapagiannis2024miles,\ntitle={{MILES}: Making Imitation Learning Easy with Self-Supervision},\nauthor={Georgios Papagiannis and Edward Johns},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=y8XkuQIrvI}\n}", "github": "", "project": "", "reviewers": "onvu;3nym;zWvW;dHST", "site": "https://openreview.net/forum?id=y8XkuQIrvI", "pdf_size": 0, "rating": "3;3;3;4", "confidence": "4;3;4;4", "rating_avg": 3.25, "confidence_avg": 3.75, "replies_avg": 6, "authors#_avg": 2, "corr_rating_confidence": 0.3333333333333333, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=5803125416273343558&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0", "aff_unique_norm": "Imperial College London", "aff_unique_dep": "", "aff_unique_url": "https://www.imperial.ac.uk", "aff_unique_abbr": "ICL", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "United Kingdom" }, { "id": "yNQu9zqx6X", "title": "Robust Manipulation Primitive Learning via Domain Contraction", "track": "main", "status": "Poster", "tldr": "", "abstract": "Contact-rich manipulation plays an important role in everyday life, but uncertain parameters pose significant challenges to model-based planning and control. To address this issue, domain adaptation and domain randomization have been proposed to learn robust policies. However, they either lose the generalization ability to diverse instances or perform conservatively due to neglecting instance-specific information. In this paper, we propose a bi-level approach to learn robust manipulation primitives, including parameter-augmented policy learning using multiple models with tensor approximation, and parameter-conditioned policy retrieval through domain contraction. This approach unifies domain randomization and domain adaptation, providing optimal behaviors while keeping generalization ability. We validate the proposed method on three contact-rich manipulation primitives: hitting, pushing, and reorientation. 
The experimental results showcase the superior performance of our approach in generating robust policies for instances with diverse physical parameters.", "keywords": "Robust policy learning;Contact-rich manipulation;Sim-to-real", "primary_area": "", "supplementary_material": "/attachment/3ffea8c73040e3b4d4bef8f33ffc8f3400e381a9.zip", "author": "Teng Xue;Amirreza Razmjoo;Suhan Shetty;Sylvain Calinon", "authorids": "~Teng_Xue1;~Amirreza_Razmjoo1;~Suhan_Shetty1;~Sylvain_Calinon1", "gender": "M;M;M;M", "homepage": "https://schortenger.github.io/;https://suhannshetty.github.io/;https://calinon.ch/;", "dblp": "219/2424;https://dblp.org/rec/journals/trob/ShettySC22;59/6334;", "google_scholar": "https://scholar.google.com.hk/citations?user=sgFW7YwAAAAJ;yLf21MoAAAAJ;t7VnipMAAAAJ;yu3z4wcAAAAJ", "orcid": ";0000-0002-7550-9368;0000-0002-9036-6799;0000-0003-3826-6608", "linkedin": ";;sylvaincalinon;amir-razmjoo/?trk=public_profile_browsemap&originalSubdomain=ch", "or_profile": "~Teng_Xue1;~Suhan_Shetty1;~Sylvain_Calinon1;~Amirreza_Razmjoo_Fard1", "aff": "Idiap Research Institute;;EPFL - EPF Lausanne;EPFL - EPF Lausanne", "aff_domain": "idiap.ch;;epfl.ch;epfl.ch", "position": "PhD student;;Lecturer;PhD student", "bibtex": "@inproceedings{\nxue2024robust,\ntitle={Robust Manipulation Primitive Learning via Domain Contraction},\nauthor={Teng Xue and Amirreza Razmjoo and Suhan Shetty and Sylvain Calinon},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yNQu9zqx6X}\n}", "github": "", "project": "", "reviewers": "aceS;q6Sa;8Guh", "site": "https://openreview.net/forum?id=yNQu9zqx6X", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;4", "rating_avg": 3.0, "confidence_avg": 4.0, "replies_avg": 5, "authors#_avg": 4, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=10038342647500807215&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 5, "aff_unique_index": "0;1;1", "aff_unique_norm": "Idiap Research Institute;EPFL", "aff_unique_dep": ";", "aff_unique_url": "https://www.idiap.ch;https://www.epfl.ch", "aff_unique_abbr": "Idiap;EPFL", "aff_campus_unique_index": "1;1", "aff_campus_unique": ";Lausanne", "aff_country_unique_index": "0;0;0", "aff_country_unique": "Switzerland" }, { "id": "ySI0tBYxpz", "title": "Gaitor: Learning a Unified Representation Across Gaits for Real-World Quadruped Locomotion", "track": "main", "status": "Poster", "tldr": "", "abstract": "The current state-of-the-art in quadruped locomotion is able to produce a variety of complex motions. These methods either rely on switching between a discrete set of skills or learn a distribution across gaits using complex black-box models. Alternatively, we present Gaitor, which learns a disentangled and 2D representation across locomotion gaits. This learnt representation forms a planning space for closed-loop control delivering continuous gait transitions and perceptive terrain traversal. Gaitor\u2019s latent space is readily interpretable and we discover that during gait transitions, novel unseen gaits emerge. The latent space is disentangled with respect to footswing heights and lengths. This means that these gait characteristics can be varied independently in the 2D latent representation. Together with a simple terrain encoding and a learnt planner operating in the latent space, Gaitor can take motion commands including desired gait type and swing characteristics all while reacting to uneven terrain. 
We evaluate Gaitor in both simulation and the real world on the ANYmal C platform. To the best of our knowledge, this is the first work learning a unified and interpretable latent space for multiple gaits, resulting in continuous blending between different locomotion modes on a real quadruped robot. An overview of the methods and results in this paper is found at https://youtu.be/eVFQbRyilCA.", "keywords": "Representation Learning;Learning for Control;Quadruped Control", "primary_area": "", "supplementary_material": "/attachment/9d4c06c35c636e9babe1fd00198d30396d48bda4.zip", "author": "Alexander Luis Mitchell;Wolfgang Merkt;Aristotelis Papatheodorou;Ioannis Havoutis;Ingmar Posner", "authorids": "~Alexander_Luis_Mitchell1;~Wolfgang_Merkt1;~Aristotelis_Papatheodorou2;~Ioannis_Havoutis1;~Ingmar_Posner1", "gender": "M;;M;;", "homepage": ";http://www.wolfgangmerkt.com/;;;", "dblp": "268/8155;;;;59/542", "google_scholar": "https://scholar.google.co.uk/citations?user=7YV2TGMAAAAJ;WzpoCwkAAAAJ;08OrK4AAAAAJ;;dPk-iwsAAAAJ", "orcid": ";0000-0003-3235-4906;0000-0003-0290-7071;;0000-0001-6270-700X", "linkedin": ";;aristotelis-papatheodorou-2559ab127;;ingmar-posner-20b49a", "or_profile": "~Alexander_Luis_Mitchell1;~Wolfgang_Merkt1;~Aristotelis_Papatheodorou2;~Ioannis_Havoutis1;~Ingmar_Posner1", "aff": "University of Oxford;University of Oxford, University of Oxford;University of Oxford;;University of Oxford", "aff_domain": "oxford.ac.uk;robots.ox.ac.uk;ox.ac.uk;;ox.ac.uk", "position": "Postdoc;Postdoc;PhD student;;Full Professor", "bibtex": "@inproceedings{\nmitchell2024gaitor,\ntitle={Gaitor: Learning a Unified Representation Across Gaits for Real-World Quadruped Locomotion},\nauthor={Alexander Luis Mitchell and Wolfgang Merkt and Aristotelis Papatheodorou and Ioannis Havoutis and Ingmar Posner},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ySI0tBYxpz}\n}", "github": "", "project": "", "reviewers": "CBEd;sMp4;jDLg", "site": "https://openreview.net/forum?id=ySI0tBYxpz", "pdf_size": 0, "rating": "3;3;4", "confidence": "3;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=17279731130050539188&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0;0;0", "aff_unique_norm": "University of Oxford", "aff_unique_dep": "", "aff_unique_url": "https://www.ox.ac.uk", "aff_unique_abbr": "Oxford", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;0", "aff_country_unique": "United Kingdom" }, { "id": "yYujuPxjDK", "title": "Language-guided Manipulator Motion Planning with Bounded Task Space", "track": "main", "status": "Poster", "tldr": "", "abstract": "Language-based robot control is a powerful and versatile method to control a robot manipulator where large language models (LLMs) are used to reason about the environment. However, the generated robot motions by these controllers often lack safety and performance, resulting in jerky movements. In this work, a novel modular framework for zero-shot motion planning for manipulation tasks is developed. The modular components do not require any motion-planning-specific training. An LLM is combined with a vision model to create Python code that interacts with a novel path planner, which creates a piecewise linear reference path with bounds around the path that ensure safety. 
An optimization-based planner, the BoundMPC framework, is utilized to execute optimal, safe, and collision-free trajectories along the reference path. The effectiveness of the approach is shown on various everyday manipulation tasks in simulation and experiment, shown in the video at www.acin.tuwien.ac.at/42d2.", "keywords": "Vision Language Models;Manipulation Planning;Path-following MPC", "primary_area": "", "supplementary_material": "/attachment/9b09a989259134ec163cfb083205fa877c664d34.zip", "author": "Thies Oelerich;Christian Hartl-Nesic;Andreas Kugi", "authorids": "~Thies_Oelerich1;christian.hartl@tuwien.ac.at;~Andreas_Kugi1", "gender": "M;;", "homepage": ";;https://www.acin.tuwien.ac.at/en/", "dblp": ";;", "google_scholar": "JyP7xV8AAAAJ;;", "orcid": "0009-0009-1004-0173;;", "linkedin": "thies-oelerich-271916182/;;", "or_profile": "~Thies_Oelerich1;christian.hartl@tuwien.ac.at;~Andreas_Kugi1", "aff": "Technische Universit\u00e4t Wien;;Technische Universit\u00e4t Wien", "aff_domain": "tuwien.ac.at;;tuwien.ac.at", "position": "PhD student;;Full Professor", "bibtex": "@inproceedings{\noelerich2024languageguided,\ntitle={Language-guided Manipulator Motion Planning with Bounded Task Space},\nauthor={Thies Oelerich and Christian Hartl-Nesic and Andreas Kugi},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yYujuPxjDK}\n}", "github": "", "project": "", "reviewers": "1wvP;p2nF;fPqJ", "site": "https://openreview.net/forum?id=yYujuPxjDK", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;3;3", "rating_avg": 3.0, "confidence_avg": 3.0, "replies_avg": 5, "authors#_avg": 3, "corr_rating_confidence": 0.0, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=2191535196794554191&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;0", "aff_unique_norm": "Technische Universit\u00e4t Wien", "aff_unique_dep": "", "aff_unique_url": "https://www.tuwien.ac.at", "aff_unique_abbr": "TU Wien", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0", "aff_country_unique": "Austria" }, { "id": "ylZHvlwUcI", "title": "Theia: Distilling Diverse Vision Foundation Models for Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "Vision-based robot policy learning, which maps visual inputs to actions, necessitates a holistic understanding of diverse visual tasks beyond single-task needs like classification or segmentation. Inspired by this, we introduce Theia, a vision foundation model for robot learning that distills multiple off-the-shelf vision foundation models trained on varied vision tasks. Theia's rich visual representations encode diverse visual knowledge, enhancing downstream robot learning. Extensive experiments demonstrate that Theia outperforms its teacher models and prior robot learning models using less training data and smaller model sizes. Additionally, we quantify the quality of pre-trained visual representations and hypothesize that higher entropy in feature norm distributions leads to improved robot learning performance. Code, models, and demo are available at https://theia.theaiinstitute.com.", "keywords": "visual representation;robot learning;distillation;foundation model", "primary_area": "", "supplementary_material": "/attachment/6aa31483363bdd12c99f85ed1987e95849de1b21.zip", "author": "Jinghuan Shang;Karl Schmeckpeper;Brandon B. 
May;Maria Vittoria Minniti;Tarik Kelestemur;David Watkins;Laura Herlant", "authorids": "~Jinghuan_Shang1;~Karl_Schmeckpeper1;~Brandon_B._May1;~Maria_Vittoria_Minniti1;~Tarik_Kelestemur1;~David_Watkins2;~Laura_Herlant1", "gender": "M;;;F;;;F", "homepage": "https://www.cs.stonybrook.edu/~jishang;https://sites.google.com/view/karlschmeckpeper/;;;https://kelestemur.com/;;", "dblp": "218/7364;245/5630;;236/5652;;;", "google_scholar": "gMvLIDUAAAAJ;E2kpqtkAAAAJ;;zKWcawIAAAAJ;;;", "orcid": "0000-0001-7301-5981;0000-0003-4989-2022;;0000-0001-7272-0937;;;", "linkedin": ";;;maria-vittoria-minniti-355186141/;;;lauraherlant", "or_profile": "~Jinghuan_Shang1;~Karl_Schmeckpeper1;~Brandon_B._May1;~Maria_Vittoria_Minniti1;~Tarik_Kelestemur1;~David_Watkins2;~Laura_Herlant1", "aff": "Department of Computer Science, State University of New York, Stony Brook;The Robotics and AI Institute;;The AI Institute;Boston Dynamics AI Institute;;The Robotics and AI Institute", "aff_domain": "cs.stonybrook.edu;theaiinstitute.com;;theaiinstitute.com;theaiinstitute.com;;theaiinstitute.com", "position": "PhD student;Researcher;;Researcher;Researcher;;Researcher", "bibtex": "@inproceedings{\nshang2024theia,\ntitle={Theia: Distilling Diverse Vision Foundation Models for Robot Learning},\nauthor={Jinghuan Shang and Karl Schmeckpeper and Brandon B. May and Maria Vittoria Minniti and Tarik Kelestemur and David Watkins and Laura Herlant},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ylZHvlwUcI}\n}", "github": "https://github.com/bdaiinstitute/theia", "project": "", "reviewers": "GPK7;TPg9;CYjq", "site": "https://openreview.net/forum?id=ylZHvlwUcI", "pdf_size": 0, "rating": "2;3;3", "confidence": "3;4;4", "rating_avg": 2.6666666666666665, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.9999999999999998, "gs_citation": 18, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=7540596398725472565&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;2;3;1", "aff_unique_norm": "State University of New York;Robotics and AI Institute;AI Institute;Boston Dynamics AI Institute", "aff_unique_dep": "Department of Computer Science;;;AI Institute", "aff_unique_url": "https://www.stonybrook.edu;;;https://www.bostondynamics.com/", "aff_unique_abbr": "SUNY Stony Brook;;;BD AI", "aff_campus_unique_index": "0", "aff_campus_unique": "Stony Brook;", "aff_country_unique_index": "0;0;0", "aff_country_unique": "United States;" }, { "id": "ypaYtV1CoG", "title": "Vocal Sandbox: Continual Learning and Adaptation for Situated Human-Robot Collaboration", "track": "main", "status": "Poster", "tldr": "", "abstract": "We introduce Vocal Sandbox, a framework for enabling seamless human-robot collaboration in situated environments. Systems in our framework are characterized by their ability to *adapt and continually learn* at multiple levels of abstraction from diverse teaching modalities such as spoken dialogue, object keypoints, and kinesthetic demonstrations. To enable such adaptation, we design lightweight and interpretable learning algorithms that allow users to build an understanding and co-adapt to a robot's capabilities in real-time, as they teach new behaviors. For example, after demonstrating a new low-level skill for \"tracking around\" an object, users are provided with trajectory visualizations of the robot's intended motion when asked to track a new object. 
Similarly, users teach high-level planning behaviors through spoken dialogue, using pretrained language models to synthesize behaviors such as \"packing an object away\" as compositions of low-level skills -- concepts that can be reused and built upon. We evaluate Vocal Sandbox in two settings: collaborative gift bag assembly and LEGO stop-motion animation. In the first setting, we run systematic ablations and user studies with 8 non-expert participants, highlighting the impact of multi-level teaching. Across 23 hours of total robot interaction time, users teach 17 new high-level behaviors with an average of 16 novel low-level skills, requiring 22.1% less active supervision compared to baselines. Qualitatively, users strongly prefer Vocal Sandbox systems due to their ease of use (+31.2%), helpfulness (+13.0%), and overall performance (+18.2%). Finally, we pair an experienced system-user with a robot to film a stop-motion animation; over two hours of continuous collaboration, the user teaches progressively more complex motion skills to produce a 52 second (232 frame) movie. Videos & Supplementary Material: https://vocal-sandbox.github.io", "keywords": "Continual Learning;Multimodal Teaching;Human-Robot Interaction", "primary_area": "", "supplementary_material": "/attachment/50dba6172f63822834309d43227c36b3f2274b1e.zip", "author": "Jennifer Grannen;Siddharth Karamcheti;Suvir Mirchandani;Percy Liang;Dorsa Sadigh", "authorids": "~Jennifer_Grannen1;~Siddharth_Karamcheti1;~Suvir_Mirchandani1;~Percy_Liang1;~Dorsa_Sadigh1", "gender": ";M;M;;F", "homepage": "https://jenngrannen.com;http://siddkaramcheti.com/;http://suvirpmirchandani.com;https://cs.stanford.edu/~pliang/;https://dorsa.fyi/", "dblp": ";199/1922;287/4981;04/1701;117/3174", "google_scholar": "O5wWFpIAAAAJ;L5v2PHAAAAAJ;fz7LJPIAAAAJ;pouyVyUAAAAJ;ZaJEZpYAAAAJ", "orcid": ";;;;", "linkedin": ";;;;", "or_profile": "~Jennifer_Grannen1;~Siddharth_Karamcheti1;~Suvir_Mirchandani1;~Percy_Liang1;~Dorsa_Sadigh1", "aff": "Computer Science Department, Stanford University;Stanford University;Stanford University;Stanford University;Stanford University", "aff_domain": "cs.stanford.edu;stanford.edu;stanford.edu;stanford.edu;stanford.edu", "position": "PhD student;PhD student;PhD student;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\ngrannen2024vocal,\ntitle={Vocal Sandbox: Continual Learning and Adaptation for Situated Human-Robot Collaboration},\nauthor={Jennifer Grannen and Siddharth Karamcheti and Suvir Mirchandani and Percy Liang and Dorsa Sadigh},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=ypaYtV1CoG}\n}", "github": "", "project": "", "reviewers": "etUa;nEiK;gPa9", "site": "https://openreview.net/forum?id=ypaYtV1CoG", "pdf_size": 0, "rating": "3;3;4", "confidence": "2;4;4", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 5, "corr_rating_confidence": 0.5, "gs_citation": 0, "gs_cited_by_link": "https://scholar.google.com/scholar?q=related:O0xEHCaOOHIJ:scholar.google.com/&scioq=Vocal+Sandbox:+Continual+Learning+and+Adaptation+for+Situated+Human-Robot+Collaboration&hl=en&as_sdt=0,5", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "Computer Science Department", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": 
"0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "yqLFb0RnDW", "title": "Unpacking Failure Modes of Generative Policies: Runtime Monitoring of Consistency and Progress", "track": "main", "status": "Poster", "tldr": "", "abstract": "Robot behavior policies trained via imitation learning are prone to failure under conditions that deviate from their training data. Thus, algorithms that monitor learned policies at test time and provide early warnings of failure are necessary to facilitate scalable deployment. We propose Sentinel, a runtime monitoring framework that splits the detection of failures into two complementary categories: 1) Erratic failures, which we detect using statistical measures of temporal action consistency, and 2) task progression failures, where we use Vision Language Models (VLMs) to detect when the policy confidently and consistently takes actions that do not solve the task. Our approach has two key strengths. First, because learned policies exhibit diverse failure modes, combining complementary detectors leads to significantly higher accuracy at failure detection. Second, using a statistical temporal action consistency measure ensures that we quickly detect when multimodal, generative policies exhibit erratic behavior at negligible computational cost. In contrast, we only use VLMs to detect modes that are less time-sensitive. We demonstrate our approach in the context of diffusion policies trained on robotic mobile manipulation domains in both simulation and the real world. By unifying temporal consistency detection and VLM runtime monitoring, Sentinel detects 18% more failures than using either of the two detectors alone and significantly outperforms baselines, thus highlighting the importance of assigning specialized detectors to complementary categories of failure. 
Qualitative results are made available at sites.google.com/stanford.edu/sentinel.", "keywords": "Failure Detection;Generative Policies;Vision Language Models", "primary_area": "", "supplementary_material": "/attachment/d127df801ca6c5255c43ef736d1e127ba465514b.zip", "author": "Christopher Agia;Rohan Sinha;Jingyun Yang;Ziang Cao;Rika Antonova;Marco Pavone;Jeannette Bohg", "authorids": "~Christopher_Agia1;~Rohan_Sinha1;~Jingyun_Yang1;~Ziang_Cao2;~Rika_Antonova1;~Marco_Pavone1;~Jeannette_Bohg1", "gender": "M;;M;;;M;", "homepage": "https://www.chrisagia.com/;https://www.stanford.edu/;https://yjy0625.github.io;;;https://web.stanford.edu/~pavone/;https://web.stanford.edu/~bohg/", "dblp": "268/3555;;;;;91/3382-1.html;52/7377", "google_scholar": "t8Em5FwAAAAJ;;7XBAa2QAAAAJ;;;RhOpyXcAAAAJ;rjnJnEkAAAAJ", "orcid": "0000-0002-1208-2539;;;;;;0000-0002-4921-7193", "linkedin": "agiachris/;;;;;;", "or_profile": "~Christopher_Agia1;~Rohan_Sinha1;~Jingyun_Yang1;~Ziang_Cao2;~Rika_Antonova1;~Marco_Pavone1;~Jeannette_Bohg1", "aff": "Stanford University;Stanford University;Stanford University;;;Stanford University;Stanford University", "aff_domain": "stanford.edu;stanford.edu;stanford.edu;;;stanford.edu;stanford.edu", "position": "PhD student;PhD student;PhD student;;;Associate Professor;Assistant Professor", "bibtex": "@inproceedings{\nagia2024unpacking,\ntitle={Unpacking Failure Modes of Generative Policies: Runtime Monitoring of Consistency and Progress},\nauthor={Christopher Agia and Rohan Sinha and Jingyun Yang and Ziang Cao and Rika Antonova and Marco Pavone and Jeannette Bohg},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=yqLFb0RnDW}\n}", "github": "", "project": "", "reviewers": "wii5;emvf;nTL6", "site": "https://openreview.net/forum?id=yqLFb0RnDW", "pdf_size": 0, "rating": "3;3;3", "confidence": "3;2;3", "rating_avg": 3.0, "confidence_avg": 2.6666666666666665, "replies_avg": 5, "authors#_avg": 7, "corr_rating_confidence": 0.0, "gs_citation": 9, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=16236930872685393968&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 4, "aff_unique_index": "0;0;0;0;0", "aff_unique_norm": "Stanford University", "aff_unique_dep": "", "aff_unique_url": "https://www.stanford.edu", "aff_unique_abbr": "Stanford", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Stanford", "aff_country_unique_index": "0;0;0;0;0", "aff_country_unique": "United States" }, { "id": "zIWu9Kmlqk", "title": "LeLaN: Learning A Language-Conditioned Navigation Policy from In-the-Wild Video", "track": "main", "status": "Poster", "tldr": "", "abstract": "We present our method, LeLaN, which uses action-free egocentric data to learn robust language-conditioned object navigation. By leveraging the knowledge of large vision and language models and grounding this knowledge using pre-trained segmentation and depth estimation models, we can label in-the-wild data from a variety of indoor and outdoor environments with diverse instructions that capture a range of objects with varied granularity and noise in their descriptions. 
Leveraging this method to label over 50 hours of data collected in indoor and outdoor environments, including robot observations, YouTube video tours, and human-collected walking data allows us to train a policy that can outperform state-of-the-art methods on the zero-shot object navigation task in both success rate and precision.", "keywords": "Language-conditioned navigation policy;data augmentation", "primary_area": "", "supplementary_material": "/attachment/6ea7d11f58c0a455af3898cfd142b748f2726d8a.zip", "author": "Noriaki Hirose;Catherine Glossop;Ajay Sridhar;Oier Mees;Sergey Levine", "authorids": "~Noriaki_Hirose1;~Catherine_Glossop1;~Ajay_Sridhar1;~Oier_Mees1;~Sergey_Levine1", "gender": "M;F;M;M;M", "homepage": ";;https://ajaysridhar.com;https://www.oiermees.com/;https://people.eecs.berkeley.edu/~svlevine/", "dblp": "126/5605;;;190/8659;80/7594", "google_scholar": "https://scholar.google.co.jp/citations?user=xvOlfw8AAAAJ;;https://scholar.google.com/citations?hl=en;https://scholar.google.de/citations?user=sgsLkM0AAAAJ;8R35rCwAAAAJ", "orcid": ";;;;", "linkedin": ";catherineglossop;;oier-mees-a3069488;", "or_profile": "~Noriaki_Hirose1;~Catherine_Glossop1;~Ajay_Sridhar1;~Oier_Mees1;~Sergey_Levine1", "aff": "Toyota Central R&D Labs., Inc;University of California, Berkeley;University of California, Berkeley;Electrical Engineering & Computer Science Department, University of California, Berkeley;Google", "aff_domain": "mosk.tytlabs.co.jp;berkeley.edu;berkeley.edu;eecs.berkeley.edu;google.com", "position": "Researcher;PhD student;Undergrad student;Postdoc;Research Scientist", "bibtex": "@inproceedings{\nhirose2024lelan,\ntitle={LeLaN: Learning A Language-Conditioned Navigation Policy from In-the-Wild Video},\nauthor={Noriaki Hirose and Catherine Glossop and Ajay Sridhar and Oier Mees and Sergey Levine},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zIWu9Kmlqk}\n}", "github": "https://github.com/NHirose/learning-language-navigation", "project": "", "reviewers": "W8xr;J4Ez;DySg", "site": "https://openreview.net/forum?id=zIWu9Kmlqk", "pdf_size": 0, "rating": "3;3;3", "confidence": "4;4;3", "rating_avg": 3.0, "confidence_avg": 3.6666666666666665, "replies_avg": 5, "authors#_avg": 5, "corr_rating_confidence": 0.0, "gs_citation": 7, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14646023039077092695&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 3, "aff_unique_index": "0;1;1;1;2", "aff_unique_norm": "Toyota Central R&D Labs., Inc;University of California, Berkeley;Google", "aff_unique_dep": ";;Google", "aff_unique_url": "https://www.toyota-global.com;https://www.berkeley.edu;https://www.google.com", "aff_unique_abbr": "Toyota R&D;UC Berkeley;Google", "aff_campus_unique_index": "1;1;1;2", "aff_campus_unique": ";Berkeley;Mountain View", "aff_country_unique_index": "0;1;1;1;1", "aff_country_unique": "Japan;United States" }, { "id": "zeYaLS2tw5", "title": "Sparse Diffusion Policy: A Sparse, Reusable, and Flexible Policy for Robot Learning", "track": "main", "status": "Poster", "tldr": "", "abstract": "The increasing complexity of tasks in robotics demands efficient strategies for multitask and continual learning. Traditional models typically rely on a universal policy for all tasks, facing challenges such as high computational costs and catastrophic forgetting when learning new tasks. To address these issues, we introduce a sparse, reusable, and flexible policy, Sparse Diffusion Policy (SDP). 
By adopting Mixture of Experts (MoE) within a transformer-based diffusion policy, SDP selectively activates experts and skills, enabling task-specific learning without retraining the entire model. It not only reduces the burden of active parameters but also facilitates the seamless integration and reuse of experts across various tasks. Extensive experiments on diverse tasks in both simulators and the real world show that SDP 1) excels in multitask scenarios with negligible increases in active parameters, 2) prevents forgetting in continual learning new tasks, and 3) enables efficient task transfer, offering a promising solution for advanced robotic applications. More demos and codes can be found on our https://anonymous.4open.science/w/sparse_diffusion_policy-24E7/.", "keywords": "Robot Policy;Multitask;Continual learning;Mixture of Experts", "primary_area": "", "supplementary_material": "/attachment/97018e5cf3834ea2fd72fade1fd7b6e6a6d1eb89.zip", "author": "Yixiao Wang;Yifei Zhang;Mingxiao Huo;Thomas Tian;Xiang Zhang;Yichen Xie;Chenfeng Xu;Pengliang Ji;Wei Zhan;Mingyu Ding;Masayoshi Tomizuka", "authorids": "~Yixiao_Wang3;~Yifei_Zhang14;~Mingxiao_Huo1;~Thomas_Tian1;~Xiang_Zhang20;~Yichen_Xie1;~Chenfeng_Xu1;~Pengliang_Ji1;~Wei_Zhan2;~Mingyu_Ding1;~Masayoshi_Tomizuka2", "gender": "M;M;M;M;M;M;M;;;M;", "homepage": "https://github.com/YixiaoWang7;;;https://scholar.google.com/citations?user=uY4D8-wAAAAJ&hl=en&authuser=1;https://xiang-zhang-98.github.io/;;;;;https://dingmyu.github.io/;", "dblp": ";;;;;;65/1881;;;188/5243;", "google_scholar": "https://scholar.google.com/citations?hl=en;;;;;SdX6DaEAAAAJ;RpqvaTUAAAAJ;;;w4yTWwoAAAAJ;", "orcid": ";0009-0005-7212-0743;;;;;0000-0002-4941-6985;;;0000-0001-6556-8359;", "linkedin": "yixiao-wang-81aba7256/;;anthony-huo-3b68a5270/;;;;;;;dingmyu/;", "or_profile": "~Yixiao_Wang3;~Yifei_Zhang14;~Mingxiao_Huo1;~Thomas_Tian1;~Xiang_Zhang20;~Yichen_Xie1;~Chenfeng_Xu1;~Pengliang_Ji1;~Wei_Zhan2;~Mingyu_Ding1;~Masayoshi_Tomizuka2", "aff": "University of California, Berkeley;University of Chinese Academy of Sciences;Carnegie Mellon University;University of California, Berkeley;University of California, Berkeley;Waymo;University of California, Berkeley;;;University of California, Berkeley;", "aff_domain": "berkeley.edu;ucas.ac.cn;andrew.cmu.edu;berkeley.edu;berkeley.edu;waymo.com;berkeley.edu;;;berkeley.edu;", "position": "PhD student;Undergrad student;MS student;PhD student;PhD student;Intern;PhD student;;;Postdoc;", "bibtex": "@inproceedings{\nwang2024sparse,\ntitle={Sparse Diffusion Policy: A Sparse, Reusable, and Flexible Policy for Robot Learning},\nauthor={Yixiao Wang and Yifei Zhang and Mingxiao Huo and Thomas Tian and Xiang Zhang and Yichen Xie and Chenfeng Xu and Pengliang Ji and Wei Zhan and Mingyu Ding and Masayoshi Tomizuka},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zeYaLS2tw5}\n}", "github": "https://github.com/AnthonyHuo/SDP", "project": "", "reviewers": "1A92;bbJc;ntiJ", "site": "https://openreview.net/forum?id=zeYaLS2tw5", "pdf_size": 0, "rating": "3;3;4", "confidence": "4;3;3", "rating_avg": 3.3333333333333335, "confidence_avg": 3.3333333333333335, "replies_avg": 4, "authors#_avg": 11, "corr_rating_confidence": -0.49999999999999983, "gs_citation": 17, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=14465252141810948219&as_sdt=5,33&sciodt=0,33&hl=en", "gs_version_total": 4, "aff_unique_index": "0;1;2;0;0;3;0;0", "aff_unique_norm": "University of California, 
Berkeley;University of Chinese Academy of Sciences;Carnegie Mellon University;Waymo", "aff_unique_dep": ";;;", "aff_unique_url": "https://www.berkeley.edu;http://www.ucas.ac.cn;https://www.cmu.edu;https://www.waymo.com", "aff_unique_abbr": "UC Berkeley;UCAS;CMU;Waymo", "aff_campus_unique_index": "0;0;0;0;0", "aff_campus_unique": "Berkeley;", "aff_country_unique_index": "0;1;0;0;0;0;0;0", "aff_country_unique": "United States;China" }, { "id": "zr2GPi3DSb", "title": "Gentle Manipulation of Tree Branches: A Contact-Aware Policy Learning Approach", "track": "main", "status": "Poster", "tldr": "", "abstract": "Learning to interact with deformable tree branches with minimal damage is challenging due to their intricate geometry and inscrutable dynamics. Furthermore, traditional vision-based modelling systems suffer from implicit occlusions in dense foliage, severely changing lighting conditions, and limited field of view, in addition to having a significant computational burden preventing real-time deployment. In this work, we simulate a procedural forest with realistic, self-similar branching structures derived from a parametric L-system model, actuated with crude spring abstractions, mirroring real-world variations with domain randomisation over the morphological and dynamic attributes. We then train a novel Proprioceptive Contact-Aware Policy (PCAP) for a reach task using reinforcement learning, aided by a whole-arm contact detection classifier and reward engineering, without external vision, tactile, or torque sensing. The agent deploys novel strategies to evade and mitigate contact impact, favouring a reactive exploration of the task space. Finally, we demonstrate that the learned behavioural patterns can be transferred zero-shot from simulation to real, allowing the arm to navigate around real branches with unseen topology and variable occlusions while minimising the contact forces and expected ruptures.", "keywords": "Reinforcement Learning;Sim-to-Real;Deformable Manipulation", "primary_area": "", "supplementary_material": "/attachment/88ce7ae955b2cce40cd7c2db09e29da9c5c89100.zip", "author": "Jay Jacob;Shizhe Cai;Paulo Vinicius Koerich Borges;Tirthankar Bandyopadhyay;Fabio Ramos", "authorids": "~Jay_Jacob1;~Shizhe_Cai2;~Paulo_Vinicius_Koerich_Borges1;~Tirthankar_Bandyopadhyay1;~Fabio_Ramos1", "gender": ";M;M;M;M", "homepage": ";https://github.com/shizhec;http://www.csiro.au;;https://fabioramos.github.io/", "dblp": ";;;;22/2488", "google_scholar": ";;;MHPQL-QAAAAJ;https://scholar.google.com.au/citations?user=T_mJiHoAAAAJ", "orcid": ";;;;", "linkedin": ";;;;fabio-ramos-3256b421/", "or_profile": "~Jay_Jacob1;~Shizhe_Cai2;~Paulo_Vinicius_Koerich_Borges1;~Tirthankar_Bandyopadhyay1;~Fabio_Ramos1", "aff": ";University of Sydney;CSIRO;, CSIRO;NVIDIA", "aff_domain": ";uni.sydney.edu.au;csiro.au;data61.csiro.au;nvidia.com", "position": ";MS student;Principal Researcher;Researcher;Principal Research Scientist", "bibtex": "@inproceedings{\njacob2024gentle,\ntitle={Gentle Manipulation of Tree Branches: A Contact-Aware Policy Learning Approach},\nauthor={Jay Jacob and Shizhe Cai and Paulo Vinicius Koerich Borges and Tirthankar Bandyopadhyay and Fabio Ramos},\nbooktitle={8th Annual Conference on Robot Learning},\nyear={2024},\nurl={https://openreview.net/forum?id=zr2GPi3DSb}\n}", "github": "", "project": "", "reviewers": "DcS1;fjCp;7UhE;2pBB", "site": "https://openreview.net/forum?id=zr2GPi3DSb", "pdf_size": 0, "rating": "2;3;3;4", "confidence": "3;4;3;4", "rating_avg": 3.0, "confidence_avg": 3.5,
"replies_avg": 6, "authors#_avg": 5, "corr_rating_confidence": 0.7071067811865476, "gs_citation": 3, "gs_cited_by_link": "https://scholar.google.com/scholar?cites=12391519939681742607&as_sdt=2005&sciodt=0,5&hl=en", "gs_version_total": 0, "aff_unique_index": "0;1;2;3", "aff_unique_norm": "University of Sydney;Commonwealth Scientific and Industrial Research Organisation;CSIRO;NVIDIA", "aff_unique_dep": ";;;NVIDIA Corporation", "aff_unique_url": "https://www.sydney.edu.au;https://www.csiro.au;https://www.csiro.au;https://www.nvidia.com", "aff_unique_abbr": "USYD;CSIRO;CSIRO;NVIDIA", "aff_campus_unique_index": "", "aff_campus_unique": "", "aff_country_unique_index": "0;0;0;1", "aff_country_unique": "Australia;United States" } ]